aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.github/workflows/build-ci-container-windows.yml2
-rw-r--r--.github/workflows/build-ci-container.yml2
-rw-r--r--clang/include/clang/Basic/BuiltinsAMDGPU.def2
-rw-r--r--clang/include/clang/Sema/Overload.h2
-rw-r--r--clang/lib/Sema/SemaOverload.cpp49
-rw-r--r--clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl91
-rw-r--r--clang/test/Sema/dllexport.c4
-rw-r--r--clang/test/SemaCXX/overload-resolution-deferred-templates.cpp28
-rw-r--r--flang-rt/lib/runtime/descriptor.cpp2
-rw-r--r--flang/lib/Lower/Bridge.cpp15
-rw-r--r--flang/lib/Lower/ConvertConstant.cpp2
-rw-r--r--flang/lib/Lower/ConvertExpr.cpp26
-rw-r--r--flang/lib/Lower/Runtime.cpp4
-rw-r--r--flang/lib/Lower/VectorSubscripts.cpp2
-rw-r--r--flang/lib/Optimizer/Builder/FIRBuilder.cpp2
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp6
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Stop.cpp2
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp4
-rw-r--r--flang/lib/Optimizer/Transforms/MemoryUtils.cpp2
-rw-r--r--flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp2
-rw-r--r--flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp2
-rw-r--r--flang/lib/Semantics/check-do-forall.cpp4
-rw-r--r--flang/test/Semantics/PowerPC/ppc-vector-types01.f902
-rw-r--r--flang/test/Semantics/PowerPC/ppc-vector-types02.f902
-rw-r--r--flang/test/Semantics/resolve40.f907
-rw-r--r--flang/unittests/Optimizer/FortranVariableTest.cpp6
-rw-r--r--libc/cmake/modules/LLVMLibCTestRules.cmake17
-rw-r--r--libc/include/dirent.h.def16
-rw-r--r--libc/include/dirent.yaml18
-rw-r--r--libc/include/llvm-libc-macros/math-macros.h101
-rw-r--r--libc/include/search.h.def18
-rw-r--r--libc/include/search.yaml24
-rw-r--r--libc/include/setjmp.h.def16
-rw-r--r--libc/include/setjmp.yaml10
-rw-r--r--libc/include/spawn.h.def16
-rw-r--r--libc/include/spawn.yaml18
-rw-r--r--libc/include/string.h.def16
-rw-r--r--libc/include/string.yaml17
-rw-r--r--libc/include/strings.h.def16
-rw-r--r--libc/include/strings.yaml31
-rw-r--r--libc/include/sys/sendfile.h.def16
-rw-r--r--libc/include/sys/sendfile.yaml12
-rw-r--r--libc/include/sys/statvfs.h.def16
-rw-r--r--libc/include/sys/statvfs.yaml10
-rw-r--r--libc/include/sys/types.yaml40
-rw-r--r--libc/include/sys/uio.h.def16
-rw-r--r--libc/include/sys/uio.yaml10
-rw-r--r--libc/include/sys/utsname.h.def16
-rw-r--r--libc/include/sys/utsname.yaml8
-rw-r--r--libc/include/threads.h.def16
-rw-r--r--libc/include/threads.yaml3
-rw-r--r--libc/include/uchar.h.def16
-rw-r--r--libc/include/uchar.yaml5
-rw-r--r--libc/shared/math.h1
-rw-r--r--libc/shared/math/exp10f16.h29
-rw-r--r--libc/src/__support/macros/properties/architectures.h2
-rw-r--r--libc/src/__support/math/CMakeLists.txt34
-rw-r--r--libc/src/__support/math/exp10_float16_constants.h43
-rw-r--r--libc/src/__support/math/exp10f16.h141
-rw-r--r--libc/src/__support/math/exp10f16_utils.h64
-rw-r--r--libc/src/math/generic/CMakeLists.txt21
-rw-r--r--libc/src/math/generic/exp10f16.cpp122
-rw-r--r--libc/src/math/generic/exp10m1f16.cpp2
-rw-r--r--libc/src/math/generic/expxf16.h56
-rw-r--r--libc/test/include/CMakeLists.txt15
-rw-r--r--libc/test/include/math_constants_test.c23
-rw-r--r--libcxx/docs/ReleaseNotes/21.rst4
-rw-r--r--libcxx/include/__config9
-rw-r--r--libcxx/include/__config_site.in1
-rw-r--r--libcxx/include/limits2
-rw-r--r--libcxx/src/random.cpp26
-rw-r--r--libcxx/test/libcxx/fuzzing/random.pass.cpp7
-rw-r--r--libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp7
-rw-r--r--libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp3
-rw-r--r--libcxx/test/std/numerics/c.math/cmath.pass.cpp7
-rw-r--r--lld/ELF/BPSectionOrderer.cpp6
-rw-r--r--lld/test/ELF/bp-section-orderer.s21
-rw-r--r--lld/test/ELF/hexagon-plt.s18
-rw-r--r--lld/test/ELF/hexagon-shared.s2
-rw-r--r--lld/test/ELF/hexagon-tls-gd-xform.s4
-rw-r--r--lldb/source/DataFormatters/ValueObjectPrinter.cpp2
-rw-r--r--lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py10
-rw-r--r--llvm/Maintainers.md14
-rw-r--r--llvm/docs/CodingStandards.rst46
-rw-r--r--llvm/docs/Extensions.rst20
-rw-r--r--llvm/include/llvm/ADT/CombinationGenerator.h3
-rw-r--r--llvm/include/llvm/ADT/STLForwardCompat.h22
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h1
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td6
-rw-r--r--llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h12
-rw-r--r--llvm/include/llvm/MC/MCObjectStreamer.h13
-rw-r--r--llvm/include/llvm/MC/MCSection.h1
-rw-r--r--llvm/include/llvm/MC/MCStreamer.h4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp22
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp2
-rw-r--r--llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp10
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp43
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp4
-rw-r--r--llvm/lib/MC/MCParser/MCTargetAsmParser.cpp5
-rw-r--r--llvm/lib/MC/MCSectionELF.cpp4
-rw-r--r--llvm/lib/MC/MCStreamer.cpp13
-rw-r--r--llvm/lib/Object/ELF.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td1
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td18
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td14
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td11
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp109
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp34
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp19
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp122
-rw-r--r--llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp138
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp8
-rw-r--r--llvm/test/CodeGen/AArch64/freeze.ll34
-rw-r--r--llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.v2i16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll11906
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll1953
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll1308
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll2946
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll216
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll87
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll5478
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll1125
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll1075
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll841
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll302
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll302
-rw-r--r--llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll302
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll161
-rw-r--r--llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll486
-rw-r--r--llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/fabs.bf16.ll123
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll532
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll532
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll257
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll231
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll694
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-saddr-load.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll305
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll153
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll219
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log10.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll75
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll7083
-rw-r--r--llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll7083
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/i8x4-instructions.ll47
-rw-r--r--llvm/test/CodeGen/NVPTX/trunc-tofp.ll81
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll161
-rw-r--r--llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll32
-rw-r--r--llvm/test/CodeGen/RISCV/zdinx-spill.ll26
-rw-r--r--llvm/test/CodeGen/X86/vector-bitreverse.ll388
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/lifetime.ll447
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s102
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s81
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s68
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s76
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s28
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s36
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s81
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s87
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s60
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s68
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s20
-rw-r--r--llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s28
-rw-r--r--llvm/test/MC/AsmParser/llvm_section_types.s17
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt87
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt61
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt31
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt92
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt57
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt31
-rw-r--r--llvm/test/MC/Hexagon/two_ext.s4
-rw-r--r--llvm/test/MC/RISCV/Relocations/mc-dump.s1
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll59
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll84
-rw-r--r--llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s47
-rw-r--r--llvm/tools/llvm-mc/Disassembler.cpp6
-rw-r--r--llvm/tools/llvm-objdump/llvm-objdump.cpp180
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp2
-rw-r--r--llvm/unittests/ADT/STLForwardCompatTest.cpp25
-rw-r--r--llvm/utils/TableGen/Common/CodeGenSchedule.cpp6
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn1
-rw-r--r--mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h3
-rw-r--r--mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td7
-rw-r--r--mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp36
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp15
-rw-r--r--mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt2
-rw-r--r--mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp129
-rw-r--r--mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir5
-rw-r--r--mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir283
-rw-r--r--mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir95
-rw-r--r--openmp/runtime/src/kmp.h30
-rw-r--r--openmp/runtime/src/kmp_tasking.cpp252
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel38
216 files changed, 30430 insertions, 26830 deletions
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index 59079f0..f76c69f 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -11,8 +11,6 @@ on:
- .github/workflows/build-ci-container-windows.yml
- '.github/workflows/containers/github-action-ci-windows/**'
pull_request:
- branches:
- - main
paths:
- .github/workflows/build-ci-container-windows.yml
- '.github/workflows/containers/github-action-ci-windows/**'
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 3159aae..7f01264 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -11,8 +11,6 @@ on:
- .github/workflows/build-ci-container.yml
- '.github/workflows/containers/github-action-ci/**'
pull_request:
- branches:
- - main
paths:
- .github/workflows/build-ci-container.yml
- '.github/workflows/containers/github-action-ci/**'
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index a916af7..d4fef5d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -684,6 +684,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
// GFX1250 WMMA builtins
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index a70335b..d34a414 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -1491,8 +1491,6 @@ class Sema;
OverloadingResult
BestViableFunctionImpl(Sema &S, SourceLocation Loc,
OverloadCandidateSet::iterator &Best);
- void PerfectViableFunction(Sema &S, SourceLocation Loc,
- OverloadCandidateSet::iterator &Best);
};
bool isBetterOverloadCandidate(Sema &S, const OverloadCandidate &Cand1,
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 1b54628..5dd5b49 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11354,55 +11354,18 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S,
DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled;
if (TwoPhaseResolution) {
-
- PerfectViableFunction(S, Loc, Best);
- if (Best != end())
- return ResultForBestCandidate(Best);
+ OverloadingResult Res = BestViableFunctionImpl(S, Loc, Best);
+ if (Best != end() && Best->isPerfectMatch(S.Context)) {
+ if (!(HasDeferredTemplateConstructors &&
+ isa_and_nonnull<CXXConversionDecl>(Best->Function)))
+ return Res;
+ }
}
InjectNonDeducedTemplateCandidates(S);
return BestViableFunctionImpl(S, Loc, Best);
}
-void OverloadCandidateSet::PerfectViableFunction(
- Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) {
-
- Best = end();
- for (auto It = Candidates.begin(); It != Candidates.end(); ++It) {
-
- if (!It->isPerfectMatch(S.getASTContext()))
- continue;
-
- // We found a suitable conversion function
- // but if there is a template constructor in the target class
- // we might prefer that instead.
- if (HasDeferredTemplateConstructors &&
- isa_and_nonnull<CXXConversionDecl>(It->Function)) {
- Best = end();
- break;
- }
-
- if (Best == end()) {
- Best = It;
- continue;
- }
- if (Best->Function && It->Function) {
- FunctionDecl *D =
- S.getMoreConstrainedFunction(Best->Function, It->Function);
- if (D == nullptr) {
- Best = end();
- break;
- }
- if (D == It->Function)
- Best = It;
- continue;
- }
- // ambiguous
- Best = end();
- break;
- }
-}
-
OverloadingResult OverloadCandidateSet::BestViableFunctionImpl(
Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index a9ea176..a21862c 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -4,6 +4,9 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+typedef unsigned int uint;
+typedef unsigned short int ushort;
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_setprio_inc_wg(
@@ -42,6 +45,24 @@ void test_s_wait_tensorcnt() {
__builtin_amdgcn_s_wait_tensorcnt(0);
}
+// CHECK-LABEL: @test_prng_b32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
+// CHECK-NEXT: ret void
+//
+void test_prng_b32(global uint* out, uint a) {
+ *out = __builtin_amdgcn_prng_b32(a);
+}
+
// CHECK-LABEL: @test_tanh_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -349,6 +370,76 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
}
+// CHECK-LABEL: @test_sat_pk4_i4_i8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]])
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2
+// CHECK-NEXT: ret void
+//
+void test_sat_pk4_i4_i8(ushort *out, uint src)
+{
+ *out = __builtin_amdgcn_sat_pk4_i4_i8(src);
+ *out = __builtin_amdgcn_sat_pk4_u4_u8(src);
+}
+
+// CHECK-LABEL: @test_permlane16_swap(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
+// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
+// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
+// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
+// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
+// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
+// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
+// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
+// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
+// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: ret void
+//
+void test_permlane16_swap(global uint2* out, uint old, uint src) {
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
+ *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
+}
+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/Sema/dllexport.c b/clang/test/Sema/dllexport.c
index 3f911fb..5f6ff36e 100644
--- a/clang/test/Sema/dllexport.c
+++ b/clang/test/Sema/dllexport.c
@@ -2,6 +2,10 @@
// RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify -std=c11 %s
// RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c11 %s
// RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 %s
+// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c99 %s
+// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c11 %s
+// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c99 %s
+// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c11 %s
// Invalid usage.
__declspec(dllexport) typedef int typedef1;
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 46c3670..135865c 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -283,3 +283,31 @@ void f() {
}
#endif
+
+namespace GH147374 {
+
+struct String {};
+template <typename T> void operator+(T, String &&) = delete;
+
+struct Bar {
+ void operator+(String) const; // expected-note {{candidate function}}
+ friend void operator+(Bar, String) {}; // expected-note {{candidate function}}
+};
+
+struct Baz {
+ void operator+(String); // expected-note {{candidate function}}
+ friend void operator+(Baz, String) {}; // expected-note {{candidate function}}
+};
+
+void test() {
+ Bar a;
+ String b;
+ a + b;
+ //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Bar' and 'String')}}
+
+ Baz z;
+ z + b;
+ //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Baz' and 'String')}}
+}
+
+}
diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp
index b723acd..e9301bd 100644
--- a/flang-rt/lib/runtime/descriptor.cpp
+++ b/flang-rt/lib/runtime/descriptor.cpp
@@ -85,7 +85,7 @@ RT_API_ATTRS void Descriptor::Establish(int characterKind,
RT_API_ATTRS void Descriptor::Establish(const typeInfo::DerivedType &dt,
void *p, int rank, const SubscriptValue *extent,
ISO::CFI_attribute_t attribute) {
- std::size_t elementBytes{dt.sizeInBytes()};
+ auto elementBytes{static_cast<std::size_t>(dt.sizeInBytes())};
ISO::EstablishDescriptor(
&raw_, p, attribute, CFI_type_struct, elementBytes, rank, extent);
if (elementBytes == 0) {
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 4241d12..5f0783f 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1466,8 +1466,9 @@ private:
assert(falseTarget && "missing conditional branch false block");
mlir::Location loc = toLocation();
mlir::Value bcc = builder->createConvert(loc, builder->getI1Type(), cond);
- builder->create<mlir::cf::CondBranchOp>(loc, bcc, trueTarget, std::nullopt,
- falseTarget, std::nullopt);
+ builder->create<mlir::cf::CondBranchOp>(loc, bcc, trueTarget,
+ mlir::ValueRange{}, falseTarget,
+ mlir::ValueRange{});
}
void genConditionalBranch(mlir::Value cond,
Fortran::lower::pft::Evaluation *trueTarget,
@@ -2556,8 +2557,8 @@ private:
builder->setInsertionPointToEnd(loopWrapperOp.getBody());
auto loopOp = builder->create<fir::DoConcurrentLoopOp>(
loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr,
- /*local_vars=*/std::nullopt,
- /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt,
+ /*local_vars=*/mlir::ValueRange{},
+ /*local_syms=*/nullptr, /*reduce_vars=*/mlir::ValueRange{},
/*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr,
/*reduce_attrs=*/nullptr);
@@ -3810,9 +3811,9 @@ private:
mlir::Block *selectCaseBlock = insertBlock(blockList[0]);
mlir::Block *assumedSizeBlock =
rankStarBlock ? rankStarBlock : defaultBlock;
- builder->create<mlir::cf::CondBranchOp>(loc, isAssumedSize,
- assumedSizeBlock, std::nullopt,
- selectCaseBlock, std::nullopt);
+ builder->create<mlir::cf::CondBranchOp>(
+ loc, isAssumedSize, assumedSizeBlock, mlir::ValueRange{},
+ selectCaseBlock, mlir::ValueRange{});
startBlock(selectCaseBlock);
}
// Create fir.select_case for the other rank cases.
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index 1850b67..b8ab5d0 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -303,7 +303,7 @@ createStringLitOp(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::NamedAttribute sizeAttr(sizeTag, builder.getI64IntegerAttr(len));
llvm::SmallVector<mlir::NamedAttribute> attrs = {dataAttr, sizeAttr};
return builder.create<fir::StringLitOp>(
- loc, llvm::ArrayRef<mlir::Type>{type}, std::nullopt, attrs);
+ loc, llvm::ArrayRef<mlir::Type>{type}, mlir::ValueRange{}, attrs);
}
}
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index 0a1cd67..281ab22 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -1003,9 +1003,9 @@ public:
},
[&](const fir::MutableBoxValue &toBox) {
if (toBox.isPointer()) {
- Fortran::lower::associateMutableBox(converter, loc, toBox, expr,
- /*lbounds=*/std::nullopt,
- stmtCtx);
+ Fortran::lower::associateMutableBox(
+ converter, loc, toBox, expr,
+ /*lbounds=*/mlir::ValueRange{}, stmtCtx);
return;
}
// For allocatable components, a deep copy is needed.
@@ -3604,8 +3604,9 @@ public:
mlir::Value castTo =
builder.createConvert(loc, fir::HeapType::get(seqTy), load);
mlir::Value shapeOp = builder.genShape(loc, shape);
- return builder.create<fir::ArrayLoadOp>(
- loc, seqTy, castTo, shapeOp, /*slice=*/mlir::Value{}, std::nullopt);
+ return builder.create<fir::ArrayLoadOp>(loc, seqTy, castTo, shapeOp,
+ /*slice=*/mlir::Value{},
+ mlir::ValueRange{});
};
// Custom lowering of the element store to deal with the extra indirection
// to the lazy allocated buffer.
@@ -4207,7 +4208,7 @@ private:
auto addr =
builder->create<fir::ArrayCoorOp>(loc, eleRefTy, tmp, shape,
/*slice=*/mlir::Value{}, indices,
- /*typeParams=*/std::nullopt);
+ /*typeParams=*/mlir::ValueRange{});
auto load = builder->create<fir::LoadOp>(loc, addr);
return builder->createConvert(loc, i1Ty, load);
};
@@ -4522,17 +4523,18 @@ private:
fir::isRecordWithAllocatableMember(eleTy))
TODO(loc, "creating an array temp where the element type has "
"allocatable members");
- mlir::Value temp = !seqTy.hasDynamicExtents()
- ? builder.create<fir::AllocMemOp>(loc, type)
- : builder.create<fir::AllocMemOp>(
- loc, type, ".array.expr", std::nullopt, shape);
+ mlir::Value temp =
+ !seqTy.hasDynamicExtents()
+ ? builder.create<fir::AllocMemOp>(loc, type)
+ : builder.create<fir::AllocMemOp>(loc, type, ".array.expr",
+ mlir::ValueRange{}, shape);
fir::FirOpBuilder *bldr = &converter.getFirOpBuilder();
stmtCtx.attachCleanup(
[bldr, loc, temp]() { bldr->create<fir::FreeMemOp>(loc, temp); });
mlir::Value shapeOp = genShapeOp(shape);
return builder.create<fir::ArrayLoadOp>(loc, seqTy, temp, shapeOp,
/*slice=*/mlir::Value{},
- std::nullopt);
+ mlir::ValueRange{});
}
static fir::ShapeOp genShapeOp(mlir::Location loc, fir::FirOpBuilder &builder,
@@ -6483,7 +6485,7 @@ private:
mlir::Value initBuffSz =
builder.createIntegerConstant(loc, idxTy, clInitialBufferSize);
mem = builder.create<fir::AllocMemOp>(
- loc, eleTy, /*typeparams=*/std::nullopt, initBuffSz);
+ loc, eleTy, /*typeparams=*/mlir::ValueRange{}, initBuffSz);
builder.create<fir::StoreOp>(loc, initBuffSz, buffSize);
}
} else {
diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp
index 2be5ef7..5f73335 100644
--- a/flang/lib/Lower/Runtime.cpp
+++ b/flang/lib/Lower/Runtime.cpp
@@ -134,7 +134,7 @@ void Fortran::lower::genFailImageStatement(
mlir::Location loc = converter.getCurrentLocation();
mlir::func::FuncOp callee =
fir::runtime::getRuntimeFunc<mkRTKey(FailImageStatement)>(loc, builder);
- builder.create<fir::CallOp>(loc, callee, std::nullopt);
+ builder.create<fir::CallOp>(loc, callee, mlir::ValueRange{});
genUnreachable(builder, loc);
}
@@ -199,7 +199,7 @@ void Fortran::lower::genPauseStatement(
mlir::Location loc = converter.getCurrentLocation();
mlir::func::FuncOp callee =
fir::runtime::getRuntimeFunc<mkRTKey(PauseStatement)>(loc, builder);
- builder.create<fir::CallOp>(loc, callee, std::nullopt);
+ builder.create<fir::CallOp>(loc, callee, mlir::ValueRange{});
}
void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder,
diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp
index 389a89d..c7b3e11 100644
--- a/flang/lib/Lower/VectorSubscripts.cpp
+++ b/flang/lib/Lower/VectorSubscripts.cpp
@@ -122,7 +122,7 @@ private:
TODO(loc, "threading length parameters in field index op");
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
componentPath.emplace_back(builder.create<fir::FieldIndexOp>(
- loc, fldTy, componentName, recTy, /*typeParams*/ std::nullopt));
+ loc, fldTy, componentName, recTy, /*typeParams=*/mlir::ValueRange{}));
return fir::unwrapSequenceType(recTy.getType(componentName));
}
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index acd5a88..5b1dbc44 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -620,7 +620,7 @@ fir::StringLitOp fir::FirOpBuilder::createStringLitOp(mlir::Location loc,
mlir::NamedAttribute sizeAttr(sizeTag, getI64IntegerAttr(data.size()));
llvm::SmallVector<mlir::NamedAttribute> attrs{dataAttr, sizeAttr};
return create<fir::StringLitOp>(loc, llvm::ArrayRef<mlir::Type>{type},
- std::nullopt, attrs);
+ mlir::ValueRange{}, attrs);
}
mlir::Value fir::FirOpBuilder::genShape(mlir::Location loc,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
index 773d640..04703f7 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -59,7 +59,8 @@ mlir::Value fir::runtime::genCpuTime(fir::FirOpBuilder &builder,
mlir::Location loc) {
mlir::func::FuncOp func =
fir::runtime::getRuntimeFunc<mkRTKey(CpuTime)>(loc, builder);
- return builder.create<fir::CallOp>(loc, func, std::nullopt).getResult(0);
+ return builder.create<fir::CallOp>(loc, func, mlir::ValueRange{})
+ .getResult(0);
}
void fir::runtime::genDateAndTime(fir::FirOpBuilder &builder,
@@ -280,7 +281,8 @@ void fir::runtime::genRename(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value fir::runtime::genTime(fir::FirOpBuilder &builder,
mlir::Location loc) {
auto func = fir::runtime::getRuntimeFunc<mkRTKey(time)>(loc, builder);
- return builder.create<fir::CallOp>(loc, func, std::nullopt).getResult(0);
+ return builder.create<fir::CallOp>(loc, func, mlir::ValueRange{})
+ .getResult(0);
}
/// generate runtime call to transfer intrinsic with no size argument
diff --git a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp
index 411181c..9b5e43b 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp
@@ -25,7 +25,7 @@ void fir::runtime::genExit(fir::FirOpBuilder &builder, mlir::Location loc,
void fir::runtime::genAbort(fir::FirOpBuilder &builder, mlir::Location loc) {
mlir::func::FuncOp abortFunc =
fir::runtime::getRuntimeFunc<mkRTKey(Abort)>(loc, builder);
- builder.create<fir::CallOp>(loc, abortFunc, std::nullopt);
+ builder.create<fir::CallOp>(loc, abortFunc, mlir::ValueRange{});
}
void fir::runtime::genReportFatalUserError(fir::FirOpBuilder &builder,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index 03cc92e..c5cf01e 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -405,7 +405,7 @@ void OrderedAssignmentRewriter::pre(hlfir::ForallMaskOp forallMaskOp) {
mlir::Location loc = forallMaskOp.getLoc();
mlir::Value mask = generateYieldedScalarValue(forallMaskOp.getMaskRegion(),
builder.getI1Type());
- auto ifOp = builder.create<fir::IfOp>(loc, std::nullopt, mask, false);
+ auto ifOp = builder.create<fir::IfOp>(loc, mlir::TypeRange{}, mask, false);
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
constructStack.push_back(ifOp);
}
@@ -530,7 +530,7 @@ void OrderedAssignmentRewriter::generateMaskIfOp(mlir::Value cdt) {
mlir::Location loc = cdt.getLoc();
cdt = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{cdt});
cdt = builder.createConvert(loc, builder.getI1Type(), cdt);
- auto ifOp = builder.create<fir::IfOp>(cdt.getLoc(), std::nullopt, cdt,
+ auto ifOp = builder.create<fir::IfOp>(cdt.getLoc(), mlir::TypeRange{}, cdt,
/*withElseRegion=*/false);
constructStack.push_back(ifOp.getOperation());
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
diff --git a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp
index 1f8edf8..bc4fcd8 100644
--- a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp
+++ b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp
@@ -222,7 +222,7 @@ void AllocaReplaceImpl::genIndirectDeallocation(
rewriter.create<fir::ConvertOp>(loc, intPtrTy, ptrVal);
mlir::Value isAllocated = rewriter.create<mlir::arith::CmpIOp>(
loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0);
- auto ifOp = rewriter.create<fir::IfOp>(loc, std::nullopt, isAllocated,
+ auto ifOp = rewriter.create<fir::IfOp>(loc, mlir::TypeRange{}, isAllocated,
/*withElseRegion=*/false);
rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
mlir::Value cast = fir::factory::createConvert(
diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
index 57eae1f..6e45aae 100644
--- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp
@@ -456,7 +456,7 @@ llvm::LogicalResult SelectTypeConv::genTypeLadderStep(
rewriter.setInsertionPointToEnd(thisBlock);
if (destOps.has_value())
rewriter.create<mlir::cf::CondBranchOp>(loc, cmp, dest, destOps.value(),
- newBlock, std::nullopt);
+ newBlock, mlir::ValueRange{});
else
rewriter.create<mlir::cf::CondBranchOp>(loc, cmp, dest, newBlock);
rewriter.setInsertionPointToEnd(newBlock);
diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
index 506c8e6..ad8464b 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
@@ -261,7 +261,7 @@ public:
innermostUnorderdLoop = rewriter.create<fir::DoLoopOp>(
doConcurentOp.getLoc(), lb, ub, st,
/*unordred=*/true, /*finalCountValue=*/false,
- /*iterArgs=*/std::nullopt, loop.getReduceVars(),
+ /*iterArgs=*/mlir::ValueRange{}, loop.getReduceVars(),
loop.getReduceAttrsAttr());
ivArgs.push_back(innermostUnorderdLoop.getInductionVar());
rewriter.setInsertionPointToStart(innermostUnorderdLoop.getBody());
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index cc1d4bf..e258df8 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -1180,7 +1180,9 @@ void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) {
void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) {
const auto &control{std::get<parser::IoImpliedDoControl>(outputImpliedDo.t)};
const parser::Name &name{control.name.thing.thing};
- context_.CheckIndexVarRedefine(name.source, *name.symbol);
+ if (name.symbol) {
+ context_.CheckIndexVarRedefine(name.source, *name.symbol);
+ }
}
void DoForallChecker::Leave(const parser::StatVariable &statVariable) {
diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90
index ad69b69..ea54a00 100644
--- a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90
+++ b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90
@@ -1,7 +1,7 @@
! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s
! REQUIRES: target=powerpc{{.*}}
- ! CHECK-LABEL: PROGRAM ppc_vec_unit
+ ! CHECK-LABEL: PROGRAM PPC_VEC_UNIT
program ppc_vec_unit
implicit none
! CHECK: VECTOR(INTEGER(KIND=4_4)) :: vi1, vi2
diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90
index 8c96684..175b5868 100644
--- a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90
+++ b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90
@@ -2,7 +2,7 @@
! REQUIRES: target=powerpc{{.*}}
! C: MainProgram scope: ppc_vec_types
-! CHECK-LABEL: MainProgram scope: ppc_vec_types size={{[0-9]*}} alignment={{[0-9]*}}
+! CHECK-LABEL: MainProgram scope: PPC_VEC_TYPES size={{[0-9]*}} alignment={{[0-9]*}}
program ppc_vec_types
implicit none
vector(integer(4)) :: vi
diff --git a/flang/test/Semantics/resolve40.f90 b/flang/test/Semantics/resolve40.f90
index a91507a..81bb5f9 100644
--- a/flang/test/Semantics/resolve40.f90
+++ b/flang/test/Semantics/resolve40.f90
@@ -96,3 +96,10 @@ subroutine s12(x)
!BECAUSE: 'x' is an INTENT(IN) dummy argument
read(*,nml=nl)
end
+
+subroutine s13()
+ implicit none
+ !ERROR: No explicit type declared for 'i'
+ !ERROR: No explicit type declared for 'i'
+ print *, (i, i = 1, 2)
+end
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index 98270ad..5980877 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -48,7 +48,7 @@ TEST_F(FortranVariableTest, SimpleScalar) {
mlir::Value addr = builder->create<fir::AllocaOp>(loc, eleType);
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
- /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt,
+ /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{},
/*dummy_scope=*/nullptr, name,
/*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
/*data_attr=*/cuf::DataAttributeAttr{});
@@ -102,11 +102,11 @@ TEST_F(FortranVariableTest, SimpleArray) {
extents.size(), fir::SequenceType::getUnknownExtent());
mlir::Type seqTy = fir::SequenceType::get(typeShape, eleType);
mlir::Value addr = builder->create<fir::AllocaOp>(
- loc, seqTy, /*pinned=*/false, /*typeParams=*/std::nullopt, extents);
+ loc, seqTy, /*pinned=*/false, /*typeParams=*/mlir::ValueRange{}, extents);
mlir::Value shape = createShape(extents);
auto name = mlir::StringAttr::get(&context, "x");
auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
- shape, /*typeParams*/ std::nullopt, /*dummy_scope=*/nullptr, name,
+ shape, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, name,
/*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
/*data_attr=*/cuf::DataAttributeAttr{});
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index e210992..3fb6278 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -571,6 +571,8 @@ function(add_integration_test test_name)
target_compile_options(${fq_build_target_name} PRIVATE
${compile_options} ${INTEGRATION_TEST_COMPILE_OPTIONS})
+ set(compiler_runtime "")
+
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
target_link_options(${fq_build_target_name} PRIVATE
${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS}
@@ -599,17 +601,19 @@ function(add_integration_test test_name)
set(link_options
-nolibc
-nostartfiles
- -static
+ -nostdlib
${LIBC_LINK_OPTIONS_DEFAULT}
${LIBC_TEST_LINK_OPTIONS_DEFAULT}
)
target_link_options(${fq_build_target_name} PRIVATE ${link_options})
+ list(APPEND compiler_runtime ${LIBGCC_S_LOCATION})
endif()
target_link_libraries(
${fq_build_target_name}
- ${fq_target_name}.__libc__
libc.startup.${LIBC_TARGET_OS}.crt1
libc.test.IntegrationTest.test
+ ${fq_target_name}.__libc__
+ ${compiler_runtime}
)
add_dependencies(${fq_build_target_name}
libc.test.IntegrationTest.test
@@ -770,6 +774,7 @@ function(add_libc_hermetic test_name)
${HERMETIC_TEST_COMPILE_OPTIONS})
set(link_libraries "")
+ set(compiler_runtime "")
foreach(lib IN LISTS HERMETIC_TEST_LINK_LIBRARIES)
if(TARGET ${lib}.hermetic)
list(APPEND link_libraries ${lib}.hermetic)
@@ -807,12 +812,12 @@ function(add_libc_hermetic test_name)
set(link_options
-nolibc
-nostartfiles
- -static
+ -nostdlib
${LIBC_LINK_OPTIONS_DEFAULT}
${LIBC_TEST_LINK_OPTIONS_DEFAULT}
)
target_link_options(${fq_build_target_name} PRIVATE ${link_options})
- list(APPEND link_libraries ${LIBGCC_S_LOCATION})
+ list(APPEND compiler_runtime ${LIBGCC_S_LOCATION})
endif()
target_link_libraries(
${fq_build_target_name}
@@ -820,7 +825,9 @@ function(add_libc_hermetic test_name)
libc.startup.${LIBC_TARGET_OS}.crt1
${link_libraries}
LibcHermeticTestSupport.hermetic
- ${fq_target_name}.__libc__)
+ ${fq_target_name}.__libc__
+ ${compiler_runtime}
+ )
add_dependencies(${fq_build_target_name}
LibcTest.hermetic
libc.test.UnitTest.ErrnoSetterMatcher
diff --git a/libc/include/dirent.h.def b/libc/include/dirent.h.def
deleted file mode 100644
index 6786578..0000000
--- a/libc/include/dirent.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- POSIX header dirent.h ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_DIRENT_H
-#define LLVM_LIBC_DIRENT_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_DIRENT_H
diff --git a/libc/include/dirent.yaml b/libc/include/dirent.yaml
index 3fc522f..66570bc 100644
--- a/libc/include/dirent.yaml
+++ b/libc/include/dirent.yaml
@@ -1,47 +1,45 @@
header: dirent.h
-header_template: dirent.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: struct_dirent
- type_name: DIR
- type_name: ino_t
-enums: []
-objects: []
functions:
- name: alphasort
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: const struct dirent **
- type: const struct dirent **
- name: closedir
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: DIR *
- name: dirfd
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: DIR *
- name: fdopendir
standards:
- - POSIX
+ - posix
return_type: DIR *
arguments:
- type: int
- name: opendir
standards:
- - POSIX
+ - posix
return_type: DIR *
arguments:
- type: const char *
- name: readdir
standards:
- - POSIX
+ - posix
return_type: struct dirent *
arguments:
- type: DIR *
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index 2f05d75..6697ce5 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -50,4 +50,105 @@
#define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT)
#endif
+// POSIX math constants
+// https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html
+#define M_E (__extension__ 0x1.5bf0a8b145769p1)
+#define M_EGAMMA (__extension__ 0x1.2788cfc6fb619p-1)
+#define M_LOG2E (__extension__ 0x1.71547652b82fep0)
+#define M_LOG10E (__extension__ 0x1.bcb7b1526e50ep-2)
+#define M_LN2 (__extension__ 0x1.62e42fefa39efp-1)
+#define M_LN10 (__extension__ 0x1.26bb1bbb55516p1)
+#define M_PHI (__extension__ 0x1.9e3779b97f4a8p0)
+#define M_PI (__extension__ 0x1.921fb54442d18p1)
+#define M_PI_2 (__extension__ 0x1.921fb54442d18p0)
+#define M_PI_4 (__extension__ 0x1.921fb54442d18p-1)
+#define M_1_PI (__extension__ 0x1.45f306dc9c883p-2)
+#define M_1_SQRTPI (__extension__ 0x1.20dd750429b6dp-1)
+#define M_2_PI (__extension__ 0x1.45f306dc9c883p-1)
+#define M_2_SQRTPI (__extension__ 0x1.20dd750429b6dp0)
+#define M_SQRT2 (__extension__ 0x1.6a09e667f3bcdp0)
+#define M_SQRT3 (__extension__ 0x1.bb67ae8584caap0)
+#define M_SQRT1_2 (__extension__ 0x1.6a09e667f3bcdp-1)
+#define M_SQRT1_3 (__extension__ 0x1.279a74590331cp-1)
+
+#define M_Ef (__extension__ 0x1.5bf0a8p1f)
+#define M_EGAMMAf (__extension__ 0x1.2788dp-1f)
+#define M_LOG2Ef (__extension__ 0x1.715476p0f)
+#define M_LOG10Ef (__extension__ 0x1.bcb7b2p-2f)
+#define M_LN2f (__extension__ 0x1.62e43p-1f)
+#define M_LN10f (__extension__ 0x1.26bb1cp1f)
+#define M_PHIf (__extension__ 0x1.9e377ap0f)
+#define M_PIf (__extension__ 0x1.921fb6p1f)
+#define M_PI_2f (__extension__ 0x1.921fb6p0f)
+#define M_PI_4f (__extension__ 0x1.921fb6p-1f)
+#define M_1_PIf (__extension__ 0x1.45f306p-2f)
+#define M_1_SQRTPIf (__extension__ 0x1.20dd76p-1f)
+#define M_2_PIf (__extension__ 0x1.45f306p-1f)
+#define M_2_SQRTPIf (__extension__ 0x1.20dd76p0f)
+#define M_SQRT2f (__extension__ 0x1.6a09e6p0f)
+#define M_SQRT3f (__extension__ 0x1.bb67aep0f)
+#define M_SQRT1_2f (__extension__ 0x1.6a09e6p-1f)
+#define M_SQRT1_3f (__extension__ 0x1.279a74p-1f)
+
+#define M_El (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1L)
+#define M_EGAMMAl (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1L)
+#define M_LOG2El (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0L)
+#define M_LOG10El (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2L)
+#define M_LN2l (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1L)
+#define M_LN10l (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1L)
+#define M_PHIl (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0L)
+#define M_PIl (__extension__ 0x1.921fb54442d18469898cc51701b8p1L)
+#define M_PI_2l (__extension__ 0x1.921fb54442d18469898cc51701b8p0L)
+#define M_PI_4l (__extension__ 0x1.921fb54442d18469898cc51701b8p-1L)
+#define M_1_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-2L)
+#define M_1_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1L)
+#define M_2_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1L)
+#define M_2_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0L)
+#define M_SQRT2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0L)
+#define M_SQRT3l (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0L)
+#define M_SQRT1_2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1L)
+#define M_SQRT1_3l (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1L)
+
+#ifdef __FLT16_MANT_DIG__
+#define M_Ef16 (__extension__ 0x1.5cp1f16)
+#define M_EGAMMAf16 (__extension__ 0x1.278p-1f16)
+#define M_LOG2Ef16 (__extension__ 0x1.714p0f16)
+#define M_LOG10Ef16 (__extension__ 0x1.bccp-2f16)
+#define M_LN2f16 (__extension__ 0x1.63p-1f16)
+#define M_LN10f16 (__extension__ 0x1.26cp1f16)
+#define M_PHIf16 (__extension__ 0x1.9e4p0f16)
+#define M_PIf16 (__extension__ 0x1.92p1f16)
+#define M_PI_2f16 (__extension__ 0x1.92p0f16)
+#define M_PI_4f16 (__extension__ 0x1.92p-1f16)
+#define M_1_PIf16 (__extension__ 0x1.46p-2f16)
+#define M_1_SQRTPIf16 (__extension__ 0x1.20cp-1f16)
+#define M_2_PIf16 (__extension__ 0x1.46p-1f16)
+#define M_2_SQRTPIf16 (__extension__ 0x1.20cp0f16)
+#define M_SQRT2f16 (__extension__ 0x1.6ap0f16)
+#define M_SQRT3f16 (__extension__ 0x1.bb8p0f16)
+#define M_SQRT1_2f16 (__extension__ 0x1.6ap-1f16)
+#define M_SQRT1_3f16 (__extension__ 0x1.278p-1f16)
+#endif // __FLT16_MANT_DIG__
+
+#ifdef __SIZEOF_FLOAT128__
+#define M_Ef128 (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1q)
+#define M_EGAMMAf128 (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1q)
+#define M_LOG2Ef128 (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0q)
+#define M_LOG10Ef128 (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2q)
+#define M_LN2f128 (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1q)
+#define M_LN10f128 (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1q)
+#define M_PHIf128 (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0q)
+#define M_PIf128 (__extension__ 0x1.921fb54442d18469898cc51701b8p1q)
+#define M_PI_2f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p0q)
+#define M_PI_4f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p-1q)
+#define M_1_PIf128 (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-2q)
+#define M_1_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1q)
+#define M_2_PIf128 (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1q)
+#define M_2_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0q)
+#define M_SQRT2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0q)
+#define M_SQRT3f128 (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0q)
+#define M_SQRT1_2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1q)
+#define M_SQRT1_3f128 (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1q)
+#endif // __SIZEOF_FLOAT128__
+
#endif // LLVM_LIBC_MACROS_MATH_MACROS_H
diff --git a/libc/include/search.h.def b/libc/include/search.h.def
deleted file mode 100644
index 6301ba7..0000000
--- a/libc/include/search.h.def
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- POSIX header search.h ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SEARCH_H
-#define LLVM_LIBC_SEARCH_H
-
-#include "__llvm-libc-common.h"
-#define __need_size_t
-#include <stddef.h>
-
-%%public_api()
-
-#endif // LLVM_LIBC_SEARCH_H
diff --git a/libc/include/search.yaml b/libc/include/search.yaml
index e0247af..8a3a0c5 100644
--- a/libc/include/search.yaml
+++ b/libc/include/search.yaml
@@ -1,6 +1,6 @@
header: search.h
-header_template: search.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: ACTION
- type_name: ENTRY
@@ -12,35 +12,35 @@ objects: []
functions:
- name: hcreate
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: size_t
- name: hcreate_r
- standards: GNUExtensions
+ standards: gnu
return_type: int
arguments:
- type: size_t
- type: struct hsearch_data *
- name: hdestroy
- standards: GNUExtensions
+ standards: gnu
return_type: void
arguments: []
- name: hdestroy_r
standards:
- - POSIX
+ - gnu
return_type: void
arguments:
- type: struct hsearch_data *
- name: hsearch
standards:
- - POSIX
+ - posix
return_type: ENTRY *
arguments:
- type: ENTRY
- type: ACTION
- name: hsearch_r
- standards: GNUExtensions
+ standards: gnu
return_type: int
arguments:
- type: ENTRY
@@ -49,20 +49,20 @@ functions:
- type: struct hsearch_data *
- name: insque
standards:
- - POSIX
+ - posix
return_type: void
arguments:
- type: void *
- type: void *
- name: remque
standards:
- - POSIX
+ - posix
return_type: void
arguments:
- type: void *
- name: lfind
standards:
- - POSIX
+ - posix
return_type: void *
arguments:
- type: const void *
@@ -72,7 +72,7 @@ functions:
- type: __search_compare_t
- name: lsearch
standards:
- - POSIX
+ - posix
return_type: void *
arguments:
- type: const void *
diff --git a/libc/include/setjmp.h.def b/libc/include/setjmp.h.def
deleted file mode 100644
index 670bc1a..0000000
--- a/libc/include/setjmp.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- C standard library header setjmp.h --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SETJMP_H
-#define LLVM_LIBC_SETJMP_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_SETJMP_H
diff --git a/libc/include/setjmp.yaml b/libc/include/setjmp.yaml
index 00049e5..55e0347 100644
--- a/libc/include/setjmp.yaml
+++ b/libc/include/setjmp.yaml
@@ -1,10 +1,8 @@
header: setjmp.h
-header_template: setjmp.h.def
-macros: []
+standards:
+ - stdc
types:
- type_name: jmp_buf
-enums: []
-objects: []
functions:
- name: longjmp
standards:
@@ -23,7 +21,7 @@ functions:
- type: jmp_buf
- name: sigsetjmp
standards:
- - POSIX
+ - posix
return_type: int
attributes:
- _Returns_twice
@@ -32,7 +30,7 @@ functions:
- type: int
- name: siglongjmp
standards:
- - POSIX
+ - posix
return_type: _Noreturn void
arguments:
- type: sigjmp_buf
diff --git a/libc/include/spawn.h.def b/libc/include/spawn.h.def
deleted file mode 100644
index a8d7015..0000000
--- a/libc/include/spawn.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- POSIX header spawn.h ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SPAWN_H
-#define LLVM_LIBC_SPAWN_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_SPAWN_H
diff --git a/libc/include/spawn.yaml b/libc/include/spawn.yaml
index c763cc7..ef39f66 100644
--- a/libc/include/spawn.yaml
+++ b/libc/include/spawn.yaml
@@ -1,17 +1,15 @@
header: spawn.h
-header_template: spawn.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: posix_spawn_file_actions_t
- type_name: posix_spawnattr_t
- type_name: pid_t
- type_name: mode_t
-enums: []
-objects: []
functions:
- name: posix_spawn
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: pid_t *__restrict
@@ -22,14 +20,14 @@ functions:
- type: char * const * __restrict
- name: posix_spawn_file_actions_addclose
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: posix_spawn_file_actions_t *
- type: int
- name: posix_spawn_file_actions_adddup2
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: posix_spawn_file_actions_t *
@@ -37,7 +35,7 @@ functions:
- type: int
- name: posix_spawn_file_actions_addopen
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: posix_spawn_file_actions_t *__restrict
@@ -47,13 +45,13 @@ functions:
- type: mode_t
- name: posix_spawn_file_actions_destroy
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: posix_spawn_file_actions_t *
- name: posix_spawn_file_actions_init
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: posix_spawn_file_actions_t *
diff --git a/libc/include/string.h.def b/libc/include/string.h.def
deleted file mode 100644
index 339d005..0000000
--- a/libc/include/string.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- C standard library header string.h --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_STRING_H
-#define LLVM_LIBC_STRING_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_STRING_H
diff --git a/libc/include/string.yaml b/libc/include/string.yaml
index 736dece..0bf297e 100644
--- a/libc/include/string.yaml
+++ b/libc/include/string.yaml
@@ -1,5 +1,6 @@
header: string.h
-header_template: string.h.def
+standards:
+ - stdc
macros:
- macro_name: NULL
macro_header: null-macro.h
@@ -11,7 +12,7 @@ objects: []
functions:
- name: memccpy
standards:
- - POSIX
+ - posix
return_type: void *
arguments:
- type: void *__restrict
@@ -61,7 +62,7 @@ functions:
- type: size_t
- name: mempcpy
standards:
- - POSIX
+ - posix
return_type: void *
arguments:
- type: void *__restrict
@@ -93,14 +94,14 @@ functions:
- type: size_t
- name: stpcpy
standards:
- - POSIX
+ - posix
return_type: char *
arguments:
- type: char *__restrict
- type: const char *__restrict
- name: stpncpy
standards:
- - POSIX
+ - posix
return_type: char *
arguments:
- type: char *__restrict
@@ -243,7 +244,7 @@ functions:
- type: size_t
- name: strnlen
standards:
- - POSIX
+ - posix
return_type: size_t
arguments:
- type: const char *
@@ -271,7 +272,7 @@ functions:
- type: const char *__restrict
- name: strsignal
standards:
- - POSIX
+ - posix
return_type: char *
arguments:
- type: int
@@ -298,7 +299,7 @@ functions:
- type: const char *__restrict
- name: strtok_r
standards:
- - POSIX
+ - posix
return_type: char *
arguments:
- type: char *__restrict
diff --git a/libc/include/strings.h.def b/libc/include/strings.h.def
deleted file mode 100644
index 9b016bf..0000000
--- a/libc/include/strings.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- C standard library header strings.h -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_STRINGS_H
-#define LLVM_LIBC_STRINGS_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_STRINGS_H
diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml
index 855800d..1e78f0e 100644
--- a/libc/include/strings.yaml
+++ b/libc/include/strings.yaml
@@ -1,15 +1,14 @@
header: strings.h
-header_template: strings.h.def
-macros: []
+standards:
+ - bsd
+ - posix
types:
- type_name: size_t
- type_name: locale_t
-enums: []
-objects: []
functions:
- name: bcmp
standards:
- - llvm_libc_ext
+ - bsd
return_type: int
arguments:
- type: const void *
@@ -17,7 +16,7 @@ functions:
- type: size_t
- name: bcopy
standards:
- - llvm_libc_ext
+ - bsd
return_type: void
arguments:
- type: const void *
@@ -25,69 +24,61 @@ functions:
- type: size_t
- name: bzero
standards:
- - llvm_libc_ext
+ - bsd
return_type: void
arguments:
- type: void *
- type: size_t
- name: ffs
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: int
- name: ffsl
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: long
- name: ffsll
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: long long
- name: index
standards:
- - BSDExtensions
+ - bsd
return_type: char *
arguments:
- type: const char *
- type: int
- name: rindex
standards:
- - BSDExtensions
+ - bsd
return_type: char *
arguments:
- type: const char *
- type: int
- name: strcasecmp
- standards:
- - BSDExtensions
return_type: int
arguments:
- type: const char *
- type: const char *
- name: strcasecmp_l
- standards:
- - BSDExtensions
return_type: int
arguments:
- type: const char *
- type: const char *
- type: locale_t
- name: strncasecmp
- standards:
- - BSDExtensions
return_type: int
arguments:
- type: const char *
- type: const char *
- type: size_t
- name: strncasecmp_l
- standards:
- - BSDExtensions
return_type: int
arguments:
- type: const char *
diff --git a/libc/include/sys/sendfile.h.def b/libc/include/sys/sendfile.h.def
deleted file mode 100644
index d7f21f9..0000000
--- a/libc/include/sys/sendfile.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Linux sys/sendfile.h ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SYS_SENDFILE_H
-#define LLVM_LIBC_SYS_SENDFILE_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_SYS_SENDFILE_H
diff --git a/libc/include/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml
index 259ab83..a845dab 100644
--- a/libc/include/sys/sendfile.yaml
+++ b/libc/include/sys/sendfile.yaml
@@ -1,16 +1,8 @@
header: sys/sendfile.h
-header_template: sendfile.h.def
-macros: []
-types:
- - type_name: ssize_t
- - type_name: size_t
- - type_name: off_t
-enums: []
-objects: []
+standards:
+ - linux
functions:
- name: sendfile
- standards:
- - GNUExtensions
return_type: ssize_t
arguments:
- type: int
diff --git a/libc/include/sys/statvfs.h.def b/libc/include/sys/statvfs.h.def
deleted file mode 100644
index f23c9a3..0000000
--- a/libc/include/sys/statvfs.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- POSIX header statvfs.h --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SYS_STATVFS_H
-#define LLVM_LIBC_SYS_STATVFS_H
-
-#include <__llvm-libc-common.h>
-
-%%public_api()
-
-#endif // LLVM_LIBC_SYS_STATVFS_H
diff --git a/libc/include/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml
index 8c1d254..e083677 100644
--- a/libc/include/sys/statvfs.yaml
+++ b/libc/include/sys/statvfs.yaml
@@ -1,23 +1,21 @@
header: sys/statvfs.h
-header_template: statvfs.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: struct_statvfs
- type_name: fsblkcnt_t
- type_name: fsfilcnt_t
-enums: []
-objects: []
functions:
- name: fstatvfs
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: int
- type: struct statvfs *
- name: statvfs
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: const char *__restrict
diff --git a/libc/include/sys/types.yaml b/libc/include/sys/types.yaml
index 6fa0b44..a00429d 100644
--- a/libc/include/sys/types.yaml
+++ b/libc/include/sys/types.yaml
@@ -1,32 +1,28 @@
header: sys/types.h
-header_template: types.h.def
-standards: POSIX
-macros: []
+standards:
+ - posix
types:
- - type_name: uid_t
- - type_name: time_t
- - type_name: pthread_t
- - type_name: pthread_rwlock_t
- - type_name: pthread_rwlockattr_t
- - type_name: pthread_mutex_t
- type_name: blkcnt_t
- type_name: blksize_t
- type_name: clockid_t
- - type_name: ssize_t
- - type_name: pthread_mutexattr_t
- - type_name: ino_t
- - type_name: pthread_once_t
- - type_name: mode_t
- type_name: dev_t
- - type_name: pthread_attr_t
- type_name: gid_t
- - type_name: pid_t
+ - type_name: ino_t
+ - type_name: mode_t
- type_name: nlink_t
- - type_name: suseconds_t
- type_name: off_t
- - type_name: size_t
- - type_name: pthread_key_t
+ - type_name: pid_t
+ - type_name: pthread_attr_t
- type_name: pthread_condattr_t
-enums: []
-objects: []
-functions: []
+ - type_name: pthread_key_t
+ - type_name: pthread_mutex_t
+ - type_name: pthread_mutexattr_t
+ - type_name: pthread_once_t
+ - type_name: pthread_rwlock_t
+ - type_name: pthread_rwlockattr_t
+ - type_name: pthread_t
+ - type_name: size_t
+ - type_name: ssize_t
+ - type_name: suseconds_t
+ - type_name: time_t
+ - type_name: uid_t
diff --git a/libc/include/sys/uio.h.def b/libc/include/sys/uio.h.def
deleted file mode 100644
index 76496cb..0000000
--- a/libc/include/sys/uio.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- POSIX header uio.h ------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SYS_UIO_H
-#define LLVM_LIBC_SYS_UIO_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_SYS_UIO_H
diff --git a/libc/include/sys/uio.yaml b/libc/include/sys/uio.yaml
index 6d3f336..929911e 100644
--- a/libc/include/sys/uio.yaml
+++ b/libc/include/sys/uio.yaml
@@ -1,15 +1,13 @@
header: sys/uio.h
-header_template: uio.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: struct_iovec
- type_name: ssize_t
-enums: []
-objects: []
functions:
- name: writev
standards:
- - POSIX
+ - posix
return_type: ssize_t
arguments:
- type: int
@@ -17,7 +15,7 @@ functions:
- type: int
- name: readv
standards:
- - POSIX
+ - posix
return_type: ssize_t
arguments:
- type: int
diff --git a/libc/include/sys/utsname.h.def b/libc/include/sys/utsname.h.def
deleted file mode 100644
index 08dbbfc..0000000
--- a/libc/include/sys/utsname.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- Linux sys/utsname.h -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SYS_UTSNAME_H
-#define LLVM_LIBC_SYS_UTSNAME_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_SYS_UTSNAME_H
diff --git a/libc/include/sys/utsname.yaml b/libc/include/sys/utsname.yaml
index 6c7cb71..0f0e4cd 100644
--- a/libc/include/sys/utsname.yaml
+++ b/libc/include/sys/utsname.yaml
@@ -1,14 +1,12 @@
header: sys/utsname.h
-header_template: utsname.h.def
-macros: []
+standards:
+ - posix
types:
- type_name: struct_utsname
-enums: []
-objects: []
functions:
- name: uname
standards:
- - POSIX
+ - posix
return_type: int
arguments:
- type: struct utsname *
diff --git a/libc/include/threads.h.def b/libc/include/threads.h.def
deleted file mode 100644
index b114bea..0000000
--- a/libc/include/threads.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- C standard library header threads.h -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_THREADS_H
-#define LLVM_LIBC_THREADS_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_THREADS_H
diff --git a/libc/include/threads.yaml b/libc/include/threads.yaml
index 7014822..99b29f1 100644
--- a/libc/include/threads.yaml
+++ b/libc/include/threads.yaml
@@ -1,5 +1,6 @@
header: threads.h
-header_template: threads.h.def
+standards:
+ - stdc
macros:
- macro_name: ONCE_FLAG_INIT
macro_value: '{0}'
diff --git a/libc/include/uchar.h.def b/libc/include/uchar.h.def
deleted file mode 100644
index 31b7fcb..0000000
--- a/libc/include/uchar.h.def
+++ /dev/null
@@ -1,16 +0,0 @@
-//===-- C standard library header uchar.h ---------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_UCHAR_H
-#define LLVM_LIBC_UCHAR_H
-
-#include "__llvm-libc-common.h"
-
-%%public_api()
-
-#endif // LLVM_LIBC_UCHAR_H
diff --git a/libc/include/uchar.yaml b/libc/include/uchar.yaml
index 7139197..d0799e2 100644
--- a/libc/include/uchar.yaml
+++ b/libc/include/uchar.yaml
@@ -1,14 +1,9 @@
header: uchar.h
-header_template: uchar.h.def
standards:
- stdc
-macros: []
types:
- type_name: char32_t
- type_name: char16_t
- type_name: char8_t
- type_name: mbstate_t
- type_name: size_t
-enums: []
-objects: []
-functions: []
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 2ae7c1d..26f69d6 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -14,6 +14,7 @@
#include "math/exp.h"
#include "math/exp10.h"
#include "math/exp10f.h"
+#include "math/exp10f16.h"
#include "math/expf.h"
#include "math/expf16.h"
#include "math/frexpf.h"
diff --git a/libc/shared/math/exp10f16.h b/libc/shared/math/exp10f16.h
new file mode 100644
index 0000000..8acdbdb
--- /dev/null
+++ b/libc/shared/math/exp10f16.h
@@ -0,0 +1,29 @@
+//===-- Shared exp10f16 function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXP10F16_H
+#define LLVM_LIBC_SHARED_MATH_EXP10F16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/exp10f16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::exp10f16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_EXP10F16_H
diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h
index c88956f..ecc9319 100644
--- a/libc/src/__support/macros/properties/architectures.h
+++ b/libc/src/__support/macros/properties/architectures.h
@@ -21,7 +21,7 @@
#define LIBC_TARGET_ARCH_IS_GPU
#endif
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#if defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU)
#define LIBC_TARGET_ARCH_IS_VM
#endif
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index ad36679..77a47c6 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -198,3 +198,37 @@ add_header_library(
libc.src.__support.FPUtil.rounding_mode
libc.src.__support.macros.optimization
)
+
+add_header_library(
+ exp10_float16_constants
+ HDRS
+ exp10_float16_constants.h
+ DEPENDS
+ libc.src.__support.CPP.array
+)
+
+add_header_library(
+ exp10f16_utils
+ HDRS
+ exp10f16_utils.h
+ DEPENDS
+ .expf16_utils
+ .exp10_float16_constants
+ libc.src.__support.FPUtil.fp_bits
+)
+
+add_header_library(
+ exp10f16
+ HDRS
+ exp10f16.h
+ DEPENDS
+ .exp10f16_utils
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.macros.config
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.cpu_features
+)
diff --git a/libc/src/__support/math/exp10_float16_constants.h b/libc/src/__support/math/exp10_float16_constants.h
new file mode 100644
index 0000000..f5928db
--- /dev/null
+++ b/libc/src/__support/math/exp10_float16_constants.h
@@ -0,0 +1,43 @@
+//===-- Constants for exp10f16 function -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+#include <stdint.h>
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "src/__support/CPP/array.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN));
+static constexpr cpp::array<uint32_t, 8> EXP2_MID_BITS = {
+ 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U,
+ 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U,
+};
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log2(10), SG, RN);
+static constexpr float LOG2F_10 = 0x1.a934fp+1f;
+
+// Generated by Sollya with the following commands:
+// > display = hexadecimal;
+// > round(log10(2), SG, RN);
+static constexpr float LOG10F_2 = 0x1.344136p-2f;
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H
diff --git a/libc/src/__support/math/exp10f16.h b/libc/src/__support/math/exp10f16.h
new file mode 100644
index 0000000..0d8b125
--- /dev/null
+++ b/libc/src/__support/math/exp10f16.h
@@ -0,0 +1,141 @@
+//===-- Implementation header for exp10f16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "exp10f16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+static constexpr size_t N_EXP10F16_EXCEPTS = 5;
+#else
+static constexpr size_t N_EXP10F16_EXCEPTS = 8;
+#endif
+
+static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
+ EXP10F16_EXCEPTS = {{
+ // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ)
+ {0x363dU, 0x40e8U, 1U, 0U, 1U},
+ // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ)
+ {0x3657U, 0x40fbU, 1U, 0U, 0U},
+ // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ)
+ {0xac06U, 0x3aecU, 1U, 0U, 0U},
+ // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ)
+ {0xbf0aU, 0x2473U, 1U, 0U, 0U},
+ // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
+ {0xc387U, 0x09a5U, 1U, 0U, 0U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+ // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
+ {0x4030U, 0x57c1U, 1U, 0U, 1U},
+ // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
+ {0x406eU, 0x591fU, 1U, 0U, 1U},
+ // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ)
+ {0x446eU, 0x7690U, 1U, 0U, 1U},
+#endif
+ }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+static constexpr float16 exp10f16(float16 x) {
+ using FPBits = fputil::FPBits<float16>;
+ FPBits x_bits(x);
+
+ uint16_t x_u = x_bits.uintval();
+ uint16_t x_abs = x_u & 0x7fffU;
+
+ // When |x| >= 5 (0x4500 is 5.0f16), or x is infinite or NaN.
+ if (LIBC_UNLIKELY(x_abs >= 0x4500U)) {
+ // exp10(NaN) = NaN; a signaling NaN also raises FE_INVALID and is quieted.
+ if (x_bits.is_nan()) {
+ if (x_bits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // When x >= 5: the result overflows float16.
+ if (x_bits.is_pos()) {
+ // exp10(+inf) = +inf
+ if (x_bits.is_inf())
+ return FPBits::inf().get_val();
+
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_UPWARD:
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW);
+ return FPBits::inf().get_val();
+ default:
+ return FPBits::max_normal().get_val();
+ }
+ }
+
+ // When x <= -8 (0xc800 is -8.0f16); -8 < x <= -5 uses the generic path.
+ if (x_u >= 0xc800U) {
+ // exp10(-inf) = +0
+ if (x_bits.is_inf())
+ return FPBits::zero().get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
+
+ if (fputil::fenv_is_round_up())
+ return FPBits::min_subnormal().get_val();
+ return FPBits::zero().get_val();
+ }
+ }
+
+ // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results.
+ if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
+ switch (x_u) {
+ case 0x3c00U: // x = 1.0f16
+ return fputil::cast<float16>(10.0);
+ case 0x4000U: // x = 2.0f16
+ return fputil::cast<float16>(100.0);
+ case 0x4200U: // x = 3.0f16
+ return fputil::cast<float16>(1'000.0);
+ case 0x4400U: // x = 4.0f16
+ return fputil::cast<float16>(10'000.0);
+ }
+ }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ // 10^x = 2^(hi + mid) * 10^lo, where x = (hi + mid) * log10(2) + lo
+ auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+ return fputil::cast<float16>(exp2_hi_mid * exp10_lo);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H
diff --git a/libc/src/__support/math/exp10f16_utils.h b/libc/src/__support/math/exp10f16_utils.h
new file mode 100644
index 0000000..bffb81b
--- /dev/null
+++ b/libc/src/__support/math/exp10f16_utils.h
@@ -0,0 +1,64 @@
+//===-- Common utils for exp10f16 -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "exp10_float16_constants.h"
+#include "expf16_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LIBC_INLINE static constexpr ExpRangeReduction
+exp10_range_reduction(float16 x) {
+ // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
+ // find hi, mid, lo, such that:
+ // x = (hi + mid) * log10(2) + lo, in which
+ // hi is an integer,
+ // mid * 2^3 is an integer,
+ // -2^(-4) <= lo < 2^(-4).
+ // In particular,
+ // hi + mid = round(x * log2(10) * 2^3) * 2^(-3).
+ // Then,
+ // 10^x = 10^((hi + mid) * log10(2) + lo) = 2^(hi + mid) * 10^lo
+ // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
+ // by adding hi to the exponent field of 2^mid. 10^lo is computed using a
+ // degree-4 minimax polynomial generated by Sollya.
+
+ float xf = x;
+ float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
+ int x_hi_mid = static_cast<int>(kf);
+ unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3;
+ unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7;
+ // lo = x - (hi + mid) * log10(2) = kf * (log10(2) * -2^(-3)) + x
+ float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
+
+ uint32_t exp2_hi_mid_bits =
+ EXP2_MID_BITS[x_mid] +
+ static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
+ float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
+ // Degree-4 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
+ // > 1 + x * P;
+ float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
+ 0x1.04b434p+1f, 0x1.2bcf9ep+0f);
+ return {exp2_hi_mid, exp10_lo};
+}
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 99db743..fb253a4 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1477,20 +1477,8 @@ add_entrypoint_object(
HDRS
../exp10f16.h
DEPENDS
- .expxf16
- libc.hdr.errno_macros
- libc.hdr.fenv_macros
- libc.src.__support.CPP.array
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.except_value_utils
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.nearest_integer
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.rounding_mode
- libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.cpu_features
+ libc.src.__support.math.exp10f16
+ libc.src.errno.errno
)
add_entrypoint_object(
@@ -1519,7 +1507,6 @@ add_entrypoint_object(
HDRS
../exp10m1f16.h
DEPENDS
- .expxf16
libc.hdr.errno_macros
libc.hdr.fenv_macros
libc.src.__support.FPUtil.cast
@@ -1531,6 +1518,7 @@ add_entrypoint_object(
libc.src.__support.FPUtil.rounding_mode
libc.src.__support.macros.optimization
libc.src.__support.macros.properties.cpu_features
+ libc.src.__support.math.exp10f16_utils
)
add_entrypoint_object(
@@ -5023,10 +5011,11 @@ add_header_library(
HDRS
expxf16.h
DEPENDS
- libc.src.__support.FPUtil.cast
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.cast
libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.nearest_integer
libc.src.__support.macros.attributes
libc.src.__support.math.expf16_utils
+ libc.src.__support.math.exp10_float16_constants
)
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 31abf3b..cb3c859 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -7,128 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/exp10f16.h"
-#include "expxf16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/CPP/array.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/exp10f16.h"
namespace LIBC_NAMESPACE_DECL {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
-static constexpr size_t N_EXP10F16_EXCEPTS = 5;
-#else
-static constexpr size_t N_EXP10F16_EXCEPTS = 8;
-#endif
-
-static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
- EXP10F16_EXCEPTS = {{
- // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ)
- {0x363dU, 0x40e8U, 1U, 0U, 1U},
- // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ)
- {0x3657U, 0x40fbU, 1U, 0U, 0U},
- // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ)
- {0xac06U, 0x3aecU, 1U, 0U, 0U},
- // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ)
- {0xbf0aU, 0x2473U, 1U, 0U, 0U},
- // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
- {0xc387U, 0x09a5U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
- // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
- {0x4030U, 0x57c1U, 1U, 0U, 1U},
- // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
- {0x406eU, 0x591fU, 1U, 0U, 1U},
- // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ)
- {0x446eU, 0x7690U, 1U, 0U, 1U},
-#endif
- }};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
- using FPBits = fputil::FPBits<float16>;
- FPBits x_bits(x);
-
- uint16_t x_u = x_bits.uintval();
- uint16_t x_abs = x_u & 0x7fffU;
-
- // When |x| >= 5, or x is NaN.
- if (LIBC_UNLIKELY(x_abs >= 0x4500U)) {
- // exp10(NaN) = NaN
- if (x_bits.is_nan()) {
- if (x_bits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
-
- return x;
- }
-
- // When x >= 5.
- if (x_bits.is_pos()) {
- // exp10(+inf) = +inf
- if (x_bits.is_inf())
- return FPBits::inf().get_val();
-
- switch (fputil::quick_get_round()) {
- case FE_TONEAREST:
- case FE_UPWARD:
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_OVERFLOW);
- return FPBits::inf().get_val();
- default:
- return FPBits::max_normal().get_val();
- }
- }
-
- // When x <= -8.
- if (x_u >= 0xc800U) {
- // exp10(-inf) = +0
- if (x_bits.is_inf())
- return FPBits::zero().get_val();
-
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT);
-
- if (fputil::fenv_is_round_up())
- return FPBits::min_subnormal().get_val();
- return FPBits::zero().get_val();
- }
- }
-
- // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results.
- if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
- switch (x_u) {
- case 0x3c00U: // x = 1.0f16
- return fputil::cast<float16>(10.0);
- case 0x4000U: // x = 2.0f16
- return fputil::cast<float16>(100.0);
- case 0x4200U: // x = 3.0f16
- return fputil::cast<float16>(1'000.0);
- case 0x4400U: // x = 4.0f16
- return fputil::cast<float16>(10'000.0);
- }
- }
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- // 10^x = 2^((hi + mid) * log2(10)) * 10^lo
- auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
- return fputil::cast<float16>(exp2_hi_mid * exp10_lo);
-}
+LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { return math::exp10f16(x); }
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
index 545c479..6c2fdbe 100644
--- a/libc/src/math/generic/exp10m1f16.cpp
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "src/math/exp10m1f16.h"
-#include "expxf16.h"
#include "hdr/errno_macros.h"
#include "hdr/fenv_macros.h"
#include "src/__support/FPUtil/FEnvImpl.h"
@@ -21,6 +20,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h"
#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/exp10f16_utils.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index 05ac95d..b17b14f 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -17,18 +17,11 @@
#include "src/__support/macros/config.h"
#include <stdint.h>
+#include "src/__support/math/exp10_float16_constants.h"
#include "src/__support/math/expf16_utils.h"
namespace LIBC_NAMESPACE_DECL {
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN));
-constexpr cpp::array<uint32_t, 8> EXP2_MID_BITS = {
- 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U,
- 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U,
-};
-
LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
// For -25 < x < 16, to compute 2^x, we perform the following range reduction:
// find hi, mid, lo, such that:
@@ -68,53 +61,6 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) {
// Generated by Sollya with the following commands:
// > display = hexadecimal;
-// > round(log2(10), SG, RN);
-static constexpr float LOG2F_10 = 0x1.a934fp+1f;
-
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(log10(2), SG, RN);
-static constexpr float LOG10F_2 = 0x1.344136p-2f;
-
-LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) {
- // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
- // find hi, mid, lo, such that:
- // x = (hi + mid) * log2(10) + lo, in which
- // hi is an integer,
- // mid * 2^3 is an integer,
- // -2^(-4) <= lo < 2^(-4).
- // In particular,
- // hi + mid = round(x * 2^3) * 2^(-3).
- // Then,
- // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo
- // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid
- // by adding hi to the exponent field of 2^mid. 10^lo is computed using a
- // degree-4 minimax polynomial generated by Sollya.
-
- float xf = x;
- float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f));
- int x_hi_mid = static_cast<int>(kf);
- unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3;
- unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7;
- // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x
- float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf);
-
- uint32_t exp2_hi_mid_bits =
- EXP2_MID_BITS[x_mid] +
- static_cast<uint32_t>(x_hi << fputil::FPBits<float>::FRACTION_LEN);
- float exp2_hi_mid = fputil::FPBits<float>(exp2_hi_mid_bits).get_val();
- // Degree-4 minimax polynomial generated by Sollya with the following
- // commands:
- // > display = hexadecimal;
- // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]);
- // > 1 + x * P;
- float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f,
- 0x1.04b434p+1f, 0x1.2bcf9ep+0f);
- return {exp2_hi_mid, exp10_lo};
-}
-
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
// > round(log2(exp(1)), SG, RN);
static constexpr float LOG2F_E = 0x1.715476p+0f;
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index 24935ce..11e4c3a 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -484,6 +484,21 @@ add_libc_test(
libc.include.llvm-libc-macros.math_function_macros
)
+add_libc_test(
+ math_constants_c_test
+ C_TEST
+ UNIT_TEST_ONLY
+ SUITE
+ libc_include_tests
+ SRCS
+ math_constants_test.c
+ COMPILE_OPTIONS
+ -Wall
+ -Werror
+ DEPENDS
+ libc.include.llvm-libc-macros.math_macros
+)
+
# Test `#include <...>` of each header in each available language mode.
# This is gated on -DLLVM_LIBC_BUILD_HEADER_TESTS=ON until all the bugs
# in headers are fixed so the tests all compile.
diff --git a/libc/test/include/math_constants_test.c b/libc/test/include/math_constants_test.c
new file mode 100644
index 0000000..eb497a9
--- /dev/null
+++ b/libc/test/include/math_constants_test.c
@@ -0,0 +1,23 @@
+//===-- Unittests for math constants --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "include/llvm-libc-macros/math-macros.h"
+
+#define IS_DOUBLE(X) _Generic((X), double: 1, default: 0)
+
+#define IS_FLOAT(X) _Generic((X), float: 1, default: 0)
+
+// check if macro is defined
+#ifndef M_PI
+#error "M_PI macro is not defined"
+#else
+int main(void) {
+ _Static_assert(IS_DOUBLE(M_PI), "M_PI is not of double type.");
+ _Static_assert(IS_FLOAT(M_PIf), "M_PIf is not of float type.");
+ return 0;
+}
+#endif
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6f18b61..d31ca01 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -10,7 +10,7 @@ Written by the `Libc++ Team <https://libcxx.llvm.org>`_
.. warning::
- These are in-progress notes for the upcoming libc++ 20.0.0 release.
+ These are in-progress notes for the upcoming libc++ 21.0.0 release.
Release notes for previous releases can be found on
`the Download Page <https://releases.llvm.org/download.html>`_.
@@ -18,7 +18,7 @@ Introduction
============
This document contains the release notes for the libc++ C++ Standard Library,
-part of the LLVM Compiler Infrastructure, release 20.0.0. Here we describe the
+part of the LLVM Compiler Infrastructure, release 21.0.0. Here we describe the
status of libc++ in some detail, including major improvements from the previous
release and new feature work. For the general LLVM release notes, see `the LLVM
documentation <https://llvm.org/docs/ReleaseNotes.html>`_. All LLVM releases may
diff --git a/libcxx/include/__config b/libcxx/include/__config
index ee06abf..e442229 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -265,13 +265,6 @@ _LIBCPP_HARDENING_MODE_DEBUG
// When this option is used, the token passed to `std::random_device`'s
// constructor *must* be "/dev/urandom" -- anything else is an error.
//
-// _LIBCPP_USING_NACL_RANDOM
-// NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access,
-// including accesses to the special files under `/dev`. This implementation
-// uses the NaCL syscall `nacl_secure_random_init()` to get entropy.
-// When this option is used, the token passed to `std::random_device`'s
-// constructor *must* be "/dev/urandom" -- anything else is an error.
-//
// _LIBCPP_USING_WIN32_RANDOM
// Use rand_s(), for use on Windows.
// When this option is used, the token passed to `std::random_device`'s
@@ -283,8 +276,6 @@ _LIBCPP_HARDENING_MODE_DEBUG
# define _LIBCPP_USING_GETENTROPY
# elif defined(__Fuchsia__)
# define _LIBCPP_USING_FUCHSIA_CPRNG
-# elif defined(__native_client__)
-# define _LIBCPP_USING_NACL_RANDOM
# elif defined(_LIBCPP_WIN32API)
# define _LIBCPP_USING_WIN32_RANDOM
# else
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index fc01aaf..b68c0c8 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -30,7 +30,6 @@
#cmakedefine01 _LIBCPP_HAS_LOCALIZATION
#cmakedefine01 _LIBCPP_HAS_UNICODE
#cmakedefine01 _LIBCPP_HAS_WIDE_CHARACTERS
-#cmakedefine _LIBCPP_HAS_NO_STD_MODULES
#cmakedefine01 _LIBCPP_HAS_TIME_ZONE_DATABASE
#cmakedefine01 _LIBCPP_INSTRUMENTED_WITH_ASAN
diff --git a/libcxx/include/limits b/libcxx/include/limits
index 1205e6a..e8581cf 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -219,7 +219,7 @@ protected:
static _LIBCPP_CONSTEXPR const bool is_bounded = true;
static _LIBCPP_CONSTEXPR const bool is_modulo = !std::is_signed<_Tp>::value;
-# if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__)
+# if defined(__i386__) || defined(__x86_64__) || defined(__wasm__)
static _LIBCPP_CONSTEXPR const bool traps = true;
# else
static _LIBCPP_CONSTEXPR const bool traps = false;
diff --git a/libcxx/src/random.cpp b/libcxx/src/random.cpp
index 5c66448..79815aa 100644
--- a/libcxx/src/random.cpp
+++ b/libcxx/src/random.cpp
@@ -31,8 +31,6 @@
# include <linux/random.h>
# include <sys/ioctl.h>
# endif
-#elif defined(_LIBCPP_USING_NACL_RANDOM)
-# include <nacl/nacl_random.h>
#elif defined(_LIBCPP_USING_FUCHSIA_CPRNG)
# include <zircon/syscalls.h>
#endif
@@ -93,30 +91,6 @@ unsigned random_device::operator()() {
return r;
}
-#elif defined(_LIBCPP_USING_NACL_RANDOM)
-
-random_device::random_device(const string& __token) {
- if (__token != "/dev/urandom")
- std::__throw_system_error(ENOENT, ("random device not supported " + __token).c_str());
- int error = nacl_secure_random_init();
- if (error)
- std::__throw_system_error(error, ("random device failed to open " + __token).c_str());
-}
-
-random_device::~random_device() {}
-
-unsigned random_device::operator()() {
- unsigned r;
- size_t n = sizeof(r);
- size_t bytes_written;
- int error = nacl_secure_random(&r, n, &bytes_written);
- if (error != 0)
- std::__throw_system_error(error, "random_device failed getting bytes");
- else if (bytes_written != n)
- std::__throw_runtime_error("random_device failed to obtain enough bytes");
- return r;
-}
-
#elif defined(_LIBCPP_USING_WIN32_RANDOM)
random_device::random_device(const string& __token) {
diff --git a/libcxx/test/libcxx/fuzzing/random.pass.cpp b/libcxx/test/libcxx/fuzzing/random.pass.cpp
index cb074bd..f0256a0 100644
--- a/libcxx/test/libcxx/fuzzing/random.pass.cpp
+++ b/libcxx/test/libcxx/fuzzing/random.pass.cpp
@@ -6,9 +6,10 @@
//
//===----------------------------------------------------------------------===//
-// This test fails because Clang no longer enables -fdelayed-template-parsing
-// by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21)
+// This doesn't work on Windows because in the MSVC UCRT headers the math.h is
+// actually intended to implement the full C++ spec requirements. For details
+// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828
+// XFAIL: msvc
// UNSUPPORTED: c++03, c++11
diff --git a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
index 1ba0063..f9f81d2 100644
--- a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
+++ b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp
@@ -6,9 +6,10 @@
//
//===----------------------------------------------------------------------===//
-// This test fails because Clang no longer enables -fdelayed-template-parsing
-// by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21)
+// This doesn't work on Windows because in the MSVC UCRT headers the math.h is
+// actually intended to implement the full C++ spec requirements. For details
+// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828
+// XFAIL: msvc
// <math.h>
diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
index a9b1e44..66e149b 100644
--- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp
@@ -14,8 +14,7 @@
#include "test_macros.h"
-#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || \
- defined(__wasm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(__wasm__)
static const bool integral_types_trap = true;
#else
static const bool integral_types_trap = false;
diff --git a/libcxx/test/std/numerics/c.math/cmath.pass.cpp b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
index 48c2918..8d261e9 100644
--- a/libcxx/test/std/numerics/c.math/cmath.pass.cpp
+++ b/libcxx/test/std/numerics/c.math/cmath.pass.cpp
@@ -6,9 +6,10 @@
//
//===----------------------------------------------------------------------===//
-// This test fails because Clang no longer enables -fdelayed-template-parsing
-// by default on Windows with C++20 (#69431).
-// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21)
+// This doesn't work on Windows because in the MSVC UCRT headers the math.h is
+// actually intended to implement the full C++ spec requirements. For details
+// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828
+// XFAIL: msvc
// <cmath>
diff --git a/lld/ELF/BPSectionOrderer.cpp b/lld/ELF/BPSectionOrderer.cpp
index f464b1d..0615204 100644
--- a/lld/ELF/BPSectionOrderer.cpp
+++ b/lld/ELF/BPSectionOrderer.cpp
@@ -76,10 +76,10 @@ DenseMap<const InputSectionBase *, int> elf::runBalancedPartitioning(
if (!d)
return;
auto *sec = dyn_cast_or_null<InputSection>(d->section);
- // Skip empty, discarded, ICF folded sections. Skipping ICF folded sections
- // reduces duplicate detection work in BPSectionOrderer.
+ // Skip empty, discarded, ICF folded sections, .bss. Skipping ICF folded
+ // sections reduces duplicate detection work in BPSectionOrderer.
if (!sec || sec->size == 0 || !sec->isLive() || sec->repl != sec ||
- !orderer.secToSym.try_emplace(sec, d).second)
+ !sec->content().data() || !orderer.secToSym.try_emplace(sec, d).second)
return;
rootSymbolToSectionIdxs[CachedHashStringRef(
lld::utils::getRootSymbol(sym.getName()))]
diff --git a/lld/test/ELF/bp-section-orderer.s b/lld/test/ELF/bp-section-orderer.s
index 4df2e8d..438d7c2 100644
--- a/lld/test/ELF/bp-section-orderer.s
+++ b/lld/test/ELF/bp-section-orderer.s
@@ -26,28 +26,28 @@
# RUN: ld.lld -o out.s a.o --irpgo-profile=a.profdata --bp-startup-sort=function
# RUN: llvm-nm -jn out.s | tr '\n' , | FileCheck %s --check-prefix=STARTUP
-# STARTUP: s5,s4,s3,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d3,d2,d1,{{$}}
+# STARTUP: s5,s4,s3,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d3,d2,d1,g1,{{$}}
# RUN: ld.lld -o out.os a.o --irpgo-profile=a.profdata --bp-startup-sort=function --symbol-ordering-file a.txt
# RUN: llvm-nm -jn out.os | tr '\n' , | FileCheck %s --check-prefix=ORDER-STARTUP
-# ORDER-STARTUP: s2,s1,s5,s4,s3,A,F,E,D,B,C,merged1,merged2,_start,d3,d2,d4,d1,{{$}}
+# ORDER-STARTUP: s2,s1,s5,s4,s3,A,F,E,D,B,C,merged1,merged2,_start,d3,d2,d4,d1,g1,{{$}}
# RUN: ld.lld -o out.cf a.o --verbose-bp-section-orderer --bp-compression-sort=function 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-FUNC
# RUN: ld.lld -o out.cf.icf a.o --verbose-bp-section-orderer --bp-compression-sort=function --icf=all --gc-sections 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-ICF-FUNC
# RUN: llvm-nm -jn out.cf | tr '\n' , | FileCheck %s --check-prefix=CFUNC
-# CFUNC: s5,s4,s3,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d3,d2,d1,{{$}}
+# CFUNC: s5,s4,s3,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d3,d2,d1,g1,{{$}}
# RUN: ld.lld -o out.cd a.o --verbose-bp-section-orderer --bp-compression-sort=data 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-DATA
# RUN: llvm-nm -jn out.cd | tr '\n' , | FileCheck %s --check-prefix=CDATA
-# CDATA: s5,s3,s4,s2,s1,F,C,E,D,B,A,merged1,merged2,_start,d4,d1,d3,d2,{{$}}
+# CDATA: s5,s3,s4,s2,s1,F,C,E,D,B,A,merged1,merged2,_start,d4,d1,d3,d2,g1,{{$}}
# RUN: ld.lld -o out.cb a.o --verbose-bp-section-orderer --bp-compression-sort=both 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-BOTH
# RUN: llvm-nm -jn out.cb | tr '\n' , | FileCheck %s --check-prefix=CBOTH
-# CBOTH: s5,s3,s4,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d1,d3,d2,{{$}}
+# CBOTH: s5,s3,s4,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d1,d3,d2,g1,{{$}}
# RUN: ld.lld -o out.cbs a.o --verbose-bp-section-orderer --bp-compression-sort=both --irpgo-profile=a.profdata --bp-startup-sort=function 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-BOTH
# RUN: llvm-nm -jn out.cbs | tr '\n' , | FileCheck %s --check-prefix=CBOTH-STARTUP
-# CBOTH-STARTUP: s5,s3,s4,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d1,d3,d2,{{$}}
+# CBOTH-STARTUP: s5,s3,s4,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d1,d3,d2,g1,{{$}}
# BP-COMPRESSION-FUNC: Ordered 9 sections ([[#]] bytes) using balanced partitioning
# BP-COMPRESSION-ICF-FUNC: Ordered 8 sections ([[#]] bytes) using balanced partitioning
@@ -108,6 +108,7 @@ d3
d2
#--- a.c
+int g1;
const char s5[] = "engineering";
const char s4[] = "computer program";
const char s3[] = "hardware engineer";
@@ -377,6 +378,14 @@ d1:
.word 6 // 0x6
.size d1, 16
+ .type g1,@object // @g1
+ .section .bss.g1,"aw",@nobits
+ .globl g1
+ .p2align 2, 0x0
+g1:
+ .word 0 // 0x0
+ .size g1, 4
+
.section ".note.GNU-stack","",@progbits
.addrsig
.addrsig_sym F
diff --git a/lld/test/ELF/hexagon-plt.s b/lld/test/ELF/hexagon-plt.s
index 679de82..780dc43 100644
--- a/lld/test/ELF/hexagon-plt.s
+++ b/lld/test/ELF/hexagon-plt.s
@@ -30,31 +30,31 @@
# DIS: <_start>:
## Direct call
## Call foo directly
-# DIS-NEXT: { call 0x2003c }
+# DIS-NEXT: { call 0x2003c <foo> }
## Call bar via plt
-# DIS-NEXT: { call 0x20060 }
+# DIS-NEXT: { call 0x20060 <bar@plt> }
## Call weak via plt
-# DIS-NEXT: { call 0x20070 }
+# DIS-NEXT: { call 0x20070 <weak@plt> }
# DIS-NEXT: { immext(#0)
## Call foo directly
-# DIS-NEXT: if (p0) jump:nt 0x2003c }
+# DIS-NEXT: if (p0) jump:nt 0x2003c <foo> }
# DIS-NEXT: { immext(#64)
## Call bar via plt
-# DIS-NEXT: if (p0) jump:nt 0x20060 }
+# DIS-NEXT: if (p0) jump:nt 0x20060 <bar@plt> }
# DIS-NEXT: { immext(#64)
## Call weak via plt
-# DIS-NEXT: if (p0) jump:nt 0x20070 }
+# DIS-NEXT: if (p0) jump:nt 0x20070 <weak@plt> }
# DIS-NEXT: { immext(#0)
## Call foo directly
-# DIS-NEXT: r0 = #0 ; jump 0x2003c }
+# DIS-NEXT: r0 = #0 ; jump 0x2003c <foo> }
# DIS-NEXT: { immext(#0)
## Call bar via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20060 }
+# DIS-NEXT: r0 = #0 ; jump 0x20060 <bar@plt> }
# DIS-NEXT: { immext(#0)
## Call weak via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20070 }
+# DIS-NEXT: r0 = #0 ; jump 0x20070 <weak@plt> }
# DIS: <foo>:
# DIS-NEXT: 2003c:
diff --git a/lld/test/ELF/hexagon-shared.s b/lld/test/ELF/hexagon-shared.s
index cc62662..7f7390f 100644
--- a/lld/test/ELF/hexagon-shared.s
+++ b/lld/test/ELF/hexagon-shared.s
@@ -88,7 +88,7 @@ pvar:
# PLT-NEXT: jumpr r28 }
# TEXT: bc 00 01 00 000100bc
-# TEXT: { call 0x10300 }
+# TEXT: { call 0x10300 <bar@plt> }
# TEXT: if (p0) jump:nt 0x10300
# TEXT: r0 = #0 ; jump 0x10300
# TEXT: r0 = add(r1,##-65548)
diff --git a/lld/test/ELF/hexagon-tls-gd-xform.s b/lld/test/ELF/hexagon-tls-gd-xform.s
index 65aeb11..ade54e8 100644
--- a/lld/test/ELF/hexagon-tls-gd-xform.s
+++ b/lld/test/ELF/hexagon-tls-gd-xform.s
@@ -18,10 +18,10 @@
_start:
.ifdef GDPLT
call x@gdplt
-# CHECK_GDPLT: 101ec: { call 0x10220 }
+# CHECK_GDPLT: 101ec: { call 0x10220 <__tls_get_addr@plt> }
.else
call x
-# CHECK: 101b8: { call 0x101e0 }
+# CHECK: 101b8: { call 0x101e0 <x@plt> }
.endif
# CHECK_GDPLT: 10220: { immext(#0x20040)
diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
index 40493df..05fcc4d 100644
--- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp
+++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
@@ -854,7 +854,7 @@ llvm::Error ValueObjectPrinter::PrintChildrenIfNeeded(bool value_printed,
PrintChildren(value_printed, summary_printed, curr_ptr_depth);
} else if (HasReachedMaximumDepth() && IsAggregate() &&
ShouldPrintValueObject()) {
- m_stream->PutCString("{...}\n");
+ m_stream->PutCString(" {...}\n");
// The maximum child depth has been reached. If `m_max_depth` is the default
// (i.e. the user has _not_ customized it), then lldb presents a warning to
// the user. The warning tells the user that the limit has been reached, but
diff --git a/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py b/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py
index 1dfd7df..3364867 100644
--- a/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py
+++ b/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py
@@ -19,10 +19,10 @@ class TestFrameVarDepthAndElemCount(TestBase):
self.expect(
"frame var --depth 2 --element-count 5 -- c",
substrs=[
- "[0] = {\n b ={...}\n }",
- "[1] = {\n b ={...}\n }",
- "[2] = {\n b ={...}\n }",
- "[3] = {\n b ={...}\n }",
- "[4] = {\n b ={...}\n }",
+ "[0] = {\n b = {...}\n }",
+ "[1] = {\n b = {...}\n }",
+ "[2] = {\n b = {...}\n }",
+ "[3] = {\n b = {...}\n }",
+ "[4] = {\n b = {...}\n }",
],
)
diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md
index de5f66c..87d2a9a 100644
--- a/llvm/Maintainers.md
+++ b/llvm/Maintainers.md
@@ -263,6 +263,20 @@ amaclean@nvidia.com (email), [AlexMaclean](https://github.com/AlexMaclean) (GitH
#### PowerPC backend
+Amy Kwan (esp. release issues) \
+Amy.Kwan1@ibm.com (email), [amy-kwan](https://github.com/amy-kwan) (GitHub) \
+Lei Huang \
+lei@ca.ibm.com (email), [lei137](https://github.com/lei137) (GitHub) \
+Sean Fertile (esp. ABI/ELF/XCOFF) \
+sfertile@ca.ibm.com (email), [mandlebug](https://github.com/mandlebug) (GitHub) \
+Zhijian Lin \
+zhijian@ca.ibm.com (email), [diggerlin](https://github.com/diggerlin) (GitHub) \
+Maryam Moghadas \
+maryammo@ca.ibm.com (email), [maryammo](https://github.com/maryammo) (GitHub) \
+Roland Froese \
+froese@ca.ibm.com (email), [RolandF77](https://github.com/RolandF77) (GitHub) \
+llvmonpower \
+powerllvm@ca.ibm.com (email), [llvmonpower](https://github.com/llvmonpower) (GitHub)
#### RISCV backend
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index c614a6d..732227b 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -30,7 +30,7 @@ because the naming and other conventions are dictated by the C++ standard.
There are some conventions that are not uniformly followed in the code base
(e.g. the naming convention). This is because they are relatively new, and a
-lot of code was written before they were put in place. Our long term goal is
+lot of code was written before they were put in place. Our long-term goal is
for the entire codebase to follow the convention, but we explicitly *do not*
want patches that do large-scale reformatting of existing code. On the other
hand, it is reasonable to rename the methods of a class if you're about to
@@ -50,7 +50,7 @@ code imported into the tree. Generally, our preference is for standards
conforming, modern, and portable C++ code as the implementation language of
choice.
-For automation, build-systems and utility scripts Python is preferred and
+For automation, build-systems, and utility scripts, Python is preferred and
is widely used in the LLVM repository already.
C++ Standard Versions
@@ -92,7 +92,7 @@ LLVM support libraries (for example, `ADT
<https://github.com/llvm/llvm-project/tree/main/llvm/include/llvm/ADT>`_)
implement specialized data structures or functionality missing in the standard
library. Such libraries are usually implemented in the ``llvm`` namespace and
-follow the expected standard interface, when there is one.
+follow the expected standard interface when there is one.
When both C++ and the LLVM support libraries provide similar functionality, and
there isn't a specific reason to favor the C++ implementation, it is generally
@@ -325,8 +325,8 @@ implementation file. In any case, implementation files can include additional
comments (not necessarily in Doxygen markup) to explain implementation details
as needed.
-Don't duplicate function or class name at the beginning of the comment.
-For humans it is obvious which function or class is being documented;
+Don't duplicate the function or class name at the beginning of the comment.
+For humans, it is obvious which function or class is being documented;
automatic documentation processing tools are smart enough to bind the comment
to the correct declaration.
@@ -369,7 +369,7 @@ lower-case letter, and finish the last sentence without a period, if it would
end in one otherwise. Sentences which end with different punctuation, such as
"did you forget ';'?", should still do so.
-For example this is a good error message:
+For example, this is a good error message:
.. code-block:: none
@@ -443,7 +443,7 @@ Write your code to fit within 80 columns.
There must be some limit to the width of the code in
order to allow developers to have multiple files side-by-side in
windows on a modest display. If you are going to pick a width limit, it is
-somewhat arbitrary but you might as well pick something standard. Going with 90
+somewhat arbitrary, but you might as well pick something standard. Going with 90
columns (for example) instead of 80 columns wouldn't add any significant value
and would be detrimental to printing out code. Also many other projects have
standardized on 80 columns, so some people have already configured their editors
@@ -520,7 +520,7 @@ within each other and within function calls in order to build up aggregates
The historically common formatting of braced initialization of aggregate
variables does not mix cleanly with deep nesting, general expression contexts,
function arguments, and lambdas. We suggest new code use a simple rule for
-formatting braced initialization lists: act as-if the braces were parentheses
+formatting braced initialization lists: act as if the braces were parentheses
in a function call. The formatting rules exactly match those already well
understood for formatting nested function calls. Examples:
@@ -607,11 +607,11 @@ Static constructors and destructors (e.g., global variables whose types have a
constructor or destructor) should not be added to the code base, and should be
removed wherever possible.
-Globals in different source files are initialized in `arbitrary order
+Globals in different source files are initialized in an `arbitrary order
<https://yosefk.com/c++fqa/ctors.html#fqa-10.12>`_, making the code more
difficult to reason about.
-Static constructors have negative impact on launch time of programs that use
+Static constructors have a negative impact on the launch time of programs that use
LLVM as a library. We would really like for there to be zero cost for linking
in an additional LLVM target or other library into an application, but static
constructors undermine this goal.
@@ -698,7 +698,7 @@ If you use a braced initializer list when initializing a variable, use an equals
Use ``auto`` Type Deduction to Make Code More Readable
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Some are advocating a policy of "almost always ``auto``" in C++11, however LLVM
+Some are advocating a policy of "almost always ``auto``" in C++11; however, LLVM
uses a more moderate stance. Use ``auto`` if and only if it makes the code more
readable or easier to maintain. Don't "almost always" use ``auto``, but do use
``auto`` with initializers like ``cast<Foo>(...)`` or other places where the
@@ -783,14 +783,14 @@ guards, and might not include their prerequisites. Name such files with the
In general, a header should be implemented by one or more ``.cpp`` files. Each
of these ``.cpp`` files should include the header that defines their interface
-first. This ensures that all of the dependences of the header have been
+first. This ensures that all of the dependencies of the header have been
properly added to the header itself, and are not implicit. System headers
should be included after user headers for a translation unit.
Library Layering
^^^^^^^^^^^^^^^^
-A directory of header files (for example ``include/llvm/Foo``) defines a
+A directory of header files (for example, ``include/llvm/Foo``) defines a
library (``Foo``). One library (both
its headers and implementation) should only use things from the libraries
listed in its dependencies.
@@ -822,7 +822,7 @@ especially in header files.
But wait! Sometimes you need to have the definition of a class to use it, or to
inherit from it. In these cases go ahead and ``#include`` that header file. Be
-aware however that there are many cases where you don't need to have the full
+aware, however, that there are many cases where you don't need to have the full
definition of a class. If you are using a pointer or reference to a class, you
don't need the header file. If you are simply returning a class instance from a
prototyped function or method, you don't need it. In fact, for most cases, you
@@ -970,7 +970,7 @@ loops. A silly example is something like this:
When you have very, very small loops, this sort of structure is fine. But if it
exceeds more than 10-15 lines, it becomes difficult for people to read and
understand at a glance. The problem with this sort of code is that it gets very
-nested very quickly. Meaning that the reader of the code has to keep a lot of
+nested very quickly. This means that the reader of the code has to keep a lot of
context in their brain to remember what is going immediately on in the loop,
because they don't know if/when the ``if`` conditions will have ``else``\s etc.
It is strongly preferred to structure the loop like this:
@@ -988,7 +988,7 @@ It is strongly preferred to structure the loop like this:
...
}
-This has all the benefits of using early exits for functions: it reduces nesting
+This has all the benefits of using early exits for functions: it reduces the nesting
of the loop, it makes it easier to describe why the conditions are true, and it
makes it obvious to the reader that there is no ``else`` coming up that they
have to push context into their brain for. If a loop is large, this can be a
@@ -1149,12 +1149,12 @@ In general, names should be in camel case (e.g. ``TextFileReader`` and
nouns and start with an upper-case letter (e.g. ``TextFileReader``).
* **Variable names** should be nouns (as they represent state). The name should
- be camel case, and start with an upper case letter (e.g. ``Leader`` or
+ be camel case, and start with an upper-case letter (e.g. ``Leader`` or
``Boats``).
* **Function names** should be verb phrases (as they represent actions), and
command-like function should be imperative. The name should be camel case,
- and start with a lower case letter (e.g. ``openFile()`` or ``isFoo()``).
+ and start with a lower-case letter (e.g. ``openFile()`` or ``isFoo()``).
* **Enum declarations** (e.g. ``enum Foo {...}``) are types, so they should
follow the naming conventions for types. A common use for enums is as a
@@ -1207,7 +1207,7 @@ Assert Liberally
^^^^^^^^^^^^^^^^
Use the "``assert``" macro to its fullest. Check all of your preconditions and
-assumptions, you never know when a bug (not necessarily even yours) might be
+assumptions. You never know when a bug (not necessarily even yours) might be
caught early by an assertion, which reduces debugging time dramatically. The
"``<cassert>``" header file is probably already included by the header files you
are using, so it doesn't cost anything to use it.
@@ -1302,7 +1302,7 @@ preferred to write the code like this:
assert(NewToSet && "The value shouldn't be in the set yet");
In C code where ``[[maybe_unused]]`` is not supported, use ``void`` cast to
-suppress unused variable warning as follows:
+suppress an unused variable warning as follows:
.. code-block:: c
@@ -1546,7 +1546,7 @@ whenever possible.
The semantics of postincrement include making a copy of the value being
incremented, returning it, and then preincrementing the "work value". For
primitive types, this isn't a big deal. But for iterators, it can be a huge
-issue (for example, some iterators contains stack and set objects in them...
+issue (for example, some iterators contain stack and set objects in them...
copying an iterator could invoke the copy ctor's of these as well). In general,
get in the habit of always using preincrement, and you won't have a problem.
@@ -1663,7 +1663,7 @@ Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements
When writing the body of an ``if``, ``else``, or for/while loop statement, we
prefer to omit the braces to avoid unnecessary line noise. However, braces
-should be used in cases where the omission of braces harm the readability and
+should be used in cases where the omission of braces harms the readability and
maintainability of the code.
We consider that readability is harmed when omitting the brace in the presence
@@ -1763,7 +1763,7 @@ would help to avoid running into a "dangling else" situation.
handleAttrOnDecl(D, A, i);
}
- // Use braces on the outer block because of a nested `if`; otherwise the
+ // Use braces on the outer block because of a nested `if`; otherwise, the
// compiler would warn: `add explicit braces to avoid dangling else`
if (auto *D = dyn_cast<FunctionDecl>(D)) {
if (shouldProcess(D))
diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst
index bad72c6c..d8fb87b 100644
--- a/llvm/docs/Extensions.rst
+++ b/llvm/docs/Extensions.rst
@@ -581,6 +581,26 @@ This section stores pairs of (jump table address, number of entries).
This information is useful for tools that need to statically reconstruct
the control flow of executables.
+``SHT_LLVM_CFI_JUMP_TABLE`` Section (CFI jump table)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+This section contains the instructions that make up a `CFI jump table`_.
+It is expected to be ``SHF_ALLOC`` and may be laid out like a normal
+section. The ``SHT_LLVM_CFI_JUMP_TABLE`` section type gives the linker
+permission to modify the section in ways that would not normally be
+permitted, in order to optimize calls via the jump table.
+
+Each ``sh_entsize`` sized slice of a section of this type containing
+exactly one relocation may be considered to be a jump table entry
+that branches to the target of the relocation. This allows the linker
+to replace the jump table entry with the function body if it is small
+enough, or if the function is the last function in the jump table.
+
+A section of this type does not have to be placed according to its
+name. The linker may place the section in whichever output section it
+sees fit (generally the section that would provide the best locality).
+
+.. _CFI jump table: https://clang.llvm.org/docs/ControlFlowIntegrityDesign.html#forward-edge-cfi-for-indirect-function-calls
+
CodeView-Dependent
------------------
diff --git a/llvm/include/llvm/ADT/CombinationGenerator.h b/llvm/include/llvm/ADT/CombinationGenerator.h
index 6100aa9..bbdbd9b 100644
--- a/llvm/include/llvm/ADT/CombinationGenerator.h
+++ b/llvm/include/llvm/ADT/CombinationGenerator.h
@@ -118,10 +118,9 @@ public:
: VariablesChoices(VariablesChoices_) {
#ifndef NDEBUG
assert(!VariablesChoices.empty() && "There should be some variables.");
- llvm::for_each(VariablesChoices, [](ArrayRef<choice_type> VariableChoices) {
+ for (ArrayRef<choice_type> VariableChoices : VariablesChoices)
assert(!VariableChoices.empty() &&
"There must always be some choice, at least a placeholder one.");
- });
#endif
}
diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h
index 7bd2c87..81b9a68 100644
--- a/llvm/include/llvm/ADT/STLForwardCompat.h
+++ b/llvm/include/llvm/ADT/STLForwardCompat.h
@@ -55,21 +55,13 @@ using type_identity_t // NOLINT(readability-identifier-naming)
// TODO: Remove this in favor of std::optional<T>::transform once we switch to
// C++23.
-template <typename T, typename Function>
-auto transformOptional(const std::optional<T> &O, const Function &F)
- -> std::optional<decltype(F(*O))> {
- if (O)
- return F(*O);
- return std::nullopt;
-}
-
-// TODO: Remove this in favor of std::optional<T>::transform once we switch to
-// C++23.
-template <typename T, typename Function>
-auto transformOptional(std::optional<T> &&O, const Function &F)
- -> std::optional<decltype(F(*std::move(O)))> {
- if (O)
- return F(*std::move(O));
+template <typename Optional, typename Function,
+ typename Value = typename llvm::remove_cvref_t<Optional>::value_type>
+std::optional<std::invoke_result_t<Function, Value>>
+transformOptional(Optional &&O, Function &&F) {
+ if (O) {
+ return F(*std::forward<Optional>(O));
+ }
return std::nullopt;
}
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index 6bf2e17..e4f82ad 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1159,6 +1159,7 @@ enum : unsigned {
SHT_LLVM_OFFLOADING = 0x6fff4c0b, // LLVM device offloading data.
SHT_LLVM_LTO = 0x6fff4c0c, // .llvm.lto for fat LTO.
SHT_LLVM_JT_SIZES = 0x6fff4c0d, // LLVM jump tables sizes.
+ SHT_LLVM_CFI_JUMP_TABLE = 0x6fff4c0e, // LLVM CFI jump table.
// Android's experimental support for SHT_RELR sections.
// https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512
SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d8fda0e..ecda6c4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3572,6 +3572,12 @@ def int_amdgcn_cvt_f16_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_bf8">,
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
+def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
+
+def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">,
+ DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
+
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
index 3a7ca1a..cae2fbc 100644
--- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -136,6 +136,18 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const = 0;
+ /// Returns the disassembly of an instruction bundle for VLIW architectures
+ /// like Hexagon.
+ ///
+ /// \param Instr - An MCInst to populate with the contents of
+ /// the Bundle with sub-instructions encoded as Inst operands.
+ virtual DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const {
+ return Fail;
+ }
+
/// Used to perform separate target specific disassembly for a particular
/// symbol. May parse any prelude that precedes instructions after the
/// start of a symbol, or the entire symbol.
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index a55fd4a..319e131 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -73,20 +73,9 @@ public:
MCSymbol *emitCFILabel() override;
void emitCFISections(bool EH, bool Debug) override;
- void insert(MCFragment *F) {
- auto *Sec = CurFrag->getParent();
- F->setParent(Sec);
- F->setLayoutOrder(CurFrag->getLayoutOrder() + 1);
- CurFrag->Next = F;
- CurFrag = F;
- Sec->curFragList()->Tail = F;
- }
-
/// Get a data fragment to write into, creating a new one if the current
/// fragment is not FT_Data.
- /// Optionally a \p STI can be passed in so that a new fragment is created
- /// if the Subtarget differs from the current fragment.
- MCFragment *getOrCreateDataFragment(const MCSubtargetInfo *STI = nullptr);
+ MCFragment *getOrCreateDataFragment();
protected:
bool changeSectionImpl(MCSection *Section, uint32_t Subsection);
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 296fdd8..313071e 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -188,6 +188,7 @@ public:
// destructors.
class MCFragment {
friend class MCAssembler;
+ friend class MCStreamer;
friend class MCObjectStreamer;
friend class MCSection;
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index b3a9aab..4b91dbc 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -429,7 +429,6 @@ public:
CurFrag->getParent() == getCurrentSection().first);
return CurFrag;
}
-
/// Save the current and previous section on the section stack.
void pushSection() {
SectionStack.push_back(
@@ -457,6 +456,9 @@ public:
MCSymbol *endSection(MCSection *Section);
+ void insert(MCFragment *F);
+ void newFragment();
+
/// Returns the mnemonic for \p MI, if the streamer has access to a
/// instruction printer and returns an empty string otherwise.
virtual StringRef getMnemonic(const MCInst &MI) const { return ""; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 40464e9..fed5e72 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7641,7 +7641,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue(GN0, 0).hasOneUse() &&
isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
- TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+ TLI.isVectorLoadExtDesirable(SDValue(N, 0))) {
SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
@@ -15745,7 +15745,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
if (SDValue(GN0, 0).hasOneUse() && ExtVT == GN0->getMemoryVT() &&
- TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+ TLI.isVectorLoadExtDesirable(SDValue(N, 0))) {
SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
@@ -16772,12 +16772,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
/*Depth*/ 1))
continue;
- bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
- bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second;
- if (IsNewMaybePoisonOperand)
+ if (MaybePoisonOperands.insert(Op).second)
MaybePoisonOperandNumbers.push_back(OpNo);
- if (!HadMaybePoisonOperands)
- continue;
}
// NOTE: the whole op may be not guaranteed to not be undef or poison because
// it could create undef or poison due to it's poison-generating flags.
@@ -18727,6 +18723,12 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI))
return FTrunc;
+ // fold (sint_to_fp (trunc nsw x)) -> (sint_to_fp x)
+ if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoSignedWrap() &&
+ TLI.isTypeDesirableForOp(ISD::SINT_TO_FP,
+ N0.getOperand(0).getValueType()))
+ return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0.getOperand(0));
+
return SDValue();
}
@@ -18764,6 +18766,12 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI))
return FTrunc;
+ // fold (uint_to_fp (trunc nuw x)) -> (uint_to_fp x)
+ if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoUnsignedWrap() &&
+ TLI.isTypeDesirableForOp(ISD::UINT_TO_FP,
+ N0.getOperand(0).getValueType()))
+ return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0.getOperand(0));
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5453828..2458115 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5544,6 +5544,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::USUBSAT:
case ISD::MULHU:
case ISD::MULHS:
+ case ISD::ABDU:
+ case ISD::ABDS:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index 559d808..222dc88 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -43,12 +43,6 @@ namespace llvm {
using namespace dwarf_linker;
using namespace dwarf_linker::classic;
-enum InvalidStmtSeqOffset {
- MaxStmtSeqOffset = UINT64_MAX,
- OrigOffsetMissing = MaxStmtSeqOffset - 1,
- NewOffsetMissing = MaxStmtSeqOffset - 2,
-};
-
/// Hold the input and output of the debug info size in bytes.
struct DebugInfoSize {
uint64_t Input;
@@ -2321,7 +2315,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) {
// Some sequences are discarded by the DWARFLinker if they are invalid
// (empty).
if (OrigRowIter == SeqOffToOrigRow.end()) {
- StmtSeq.set(OrigOffsetMissing);
+ StmtSeq.set(UINT64_MAX);
continue;
}
size_t OrigRowIndex = OrigRowIter->second;
@@ -2331,7 +2325,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) {
if (NewRowIter == OrigRowToNewRow.end()) {
// If the original row index is not found in the map, update the
// stmt_sequence attribute to the 'invalid offset' magic value.
- StmtSeq.set(NewOffsetMissing);
+ StmtSeq.set(UINT64_MAX);
continue;
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 67433f2..d5b8f22 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -106,26 +106,12 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
MCDwarfFrameEmitter::Emit(*this, MAB, false);
}
-static bool canReuseDataFragment(const MCFragment &F,
- const MCAssembler &Assembler,
- const MCSubtargetInfo *STI) {
- if (!F.hasInstructions())
- return true;
- // Do not add data after a linker-relaxable instruction. The difference
- // between a new label and a label at or before the linker-relaxable
- // instruction cannot be resolved at assemble-time.
- if (F.isLinkerRelaxable())
- return false;
- // If the subtarget is changed mid fragment we start a new fragment to record
- // the new STI.
- return !STI || F.getSubtargetInfo() == STI;
-}
-
-MCFragment *
-MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) {
+MCFragment *MCObjectStreamer::getOrCreateDataFragment() {
+ // TODO: Start a new fragment whenever finalizing the variable-size tail of a
+ // previous one, so that all getOrCreateDataFragment calls can be replaced
+ // with getCurrentFragment.
auto *F = getCurrentFragment();
- if (F->getKind() != MCFragment::FT_Data ||
- !canReuseDataFragment(*F, *Assembler, STI)) {
+ if (F->getKind() != MCFragment::FT_Data) {
F = getContext().allocFragment<MCFragment>();
insert(F);
}
@@ -363,16 +349,23 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst,
F->doneAppending();
if (!Fixups.empty())
F->appendFixups(Fixups);
+ F->setHasInstructions(STI);
+ bool MarkedLinkerRelaxable = false;
for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) {
Fixup.setOffset(Fixup.getOffset() + CodeOffset);
- if (Fixup.isLinkerRelaxable()) {
- F->setLinkerRelaxable();
+ if (!Fixup.isLinkerRelaxable())
+ continue;
+ F->setLinkerRelaxable();
+ // Do not add data after a linker-relaxable instruction. The difference
+ // between a new label and a label at or before the linker-relaxable
+ // instruction cannot be resolved at assemble-time.
+ if (!MarkedLinkerRelaxable) {
+ MarkedLinkerRelaxable = true;
getCurrentSectionOnly()->setLinkerRelaxable();
+ newFragment();
}
}
-
- F->setHasInstructions(STI);
}
void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
@@ -568,8 +561,10 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment,
// if the alignment is larger than the minimum NOP size.
unsigned Size;
if (getAssembler().getBackend().shouldInsertExtraNopBytesForCodeAlign(*F,
- Size))
+ Size)) {
getCurrentSectionOnly()->setLinkerRelaxable();
+ newFragment();
+ }
}
void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index ec8b402..c7c3df3 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -571,7 +571,7 @@ bool ELFAsmParser::parseSectionArguments(bool IsPush, SMLoc loc) {
return TokError("expected end of directive");
}
- if (Mergeable)
+ if (Mergeable || TypeName == "llvm_cfi_jump_table")
if (parseMergeSize(Size))
return true;
if (Flags & ELF::SHF_LINK_ORDER)
@@ -637,6 +637,8 @@ EndStmt:
Type = ELF::SHT_LLVM_LTO;
else if (TypeName == "llvm_jt_sizes")
Type = ELF::SHT_LLVM_JT_SIZES;
+ else if (TypeName == "llvm_cfi_jump_table")
+ Type = ELF::SHT_LLVM_CFI_JUMP_TABLE;
else if (TypeName.getAsInteger(0, Type))
return TokError("unknown section type");
}
diff --git a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
index 665d92e..7f09349 100644
--- a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -9,6 +9,7 @@
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -22,6 +23,10 @@ MCTargetAsmParser::~MCTargetAsmParser() = default;
MCSubtargetInfo &MCTargetAsmParser::copySTI() {
MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI());
STI = &STICopy;
+ // The returned STI will likely be modified. Create a new fragment to prevent
+ // mixing STI values within a fragment.
+ if (getStreamer().getCurrentFragment())
+ getStreamer().newFragment();
return STICopy;
}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
index cc7cdf2..299fe40 100644
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ b/llvm/lib/MC/MCSectionELF.cpp
@@ -176,11 +176,13 @@ void MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
OS << "llvm_lto";
else if (Type == ELF::SHT_LLVM_JT_SIZES)
OS << "llvm_jt_sizes";
+ else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE)
+ OS << "llvm_cfi_jump_table";
else
OS << "0x" << Twine::utohexstr(Type);
if (EntrySize) {
- assert(Flags & ELF::SHF_MERGE);
+ assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE);
OS << "," << EntrySize;
}
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index d814ab88..c3ecf8f 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1404,6 +1404,19 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) {
return Sym;
}
+void MCStreamer::insert(MCFragment *F) { // Append F right after CurFrag.
+ auto *Sec = CurFrag->getParent(); // F joins the current fragment's section.
+ F->setParent(Sec);
+ F->setLayoutOrder(CurFrag->getLayoutOrder() + 1); // One past CurFrag.
+ CurFrag->Next = F; // Link F behind the old current fragment.
+ CurFrag = F; // F is now the streamer's current fragment.
+ Sec->curFragList()->Tail = F; // Keep the section's list tail in sync.
+}
+
+void MCStreamer::newFragment() { // Start a fresh fragment after CurFrag.
+ insert(getContext().allocFragment<MCFragment>()); // It becomes current.
+}
+
static VersionTuple
targetVersionOrMinimumSupportedOSVersion(const Triple &Target,
VersionTuple TargetVersion) {
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index af073f6..788c602 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -321,6 +321,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_OFFLOADING);
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LTO);
STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_JT_SIZES)
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CFI_JUMP_TABLE)
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_SFRAME);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES);
STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d04e6c4..f026726 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6439,7 +6439,9 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
}
}
- return true;
+ EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
+ return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
+ PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index cbbb57c..bf2f37b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4558,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_u16:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
+ case Intrinsic::amdgcn_sat_pk4_i4_i8:
+ case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ab7d340..9e1951e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2850,6 +2850,7 @@ def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
+def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2a6fcad..991d9f8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3427,30 +3427,32 @@ def : GCNPat <
(S_LSHL_B32 SReg_32:$src1, (i16 16))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))),
(v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1))
>;
-
def : GCNPat <
- (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
- (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
+ (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
+ (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
def : GCNPat <
- (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))),
- (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
+ (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
>;
+}
def : GCNPat <
- (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
(S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
def : GCNPat <
- (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))),
- (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1))
+ (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
+ (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;
foreach vecTy = [v2i16, v2f16, v2bf16] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1bbbb61..f621f85 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -803,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in {
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>;
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>;
}
+
+ defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>;
+ defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>;
} // End SubtargetPredicate = isGFX1250Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -1080,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
}
+multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); // The matching _e64 pseudo.
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; // 10-bit op = 0b011 : op[6:0].
+}
+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
@@ -1147,8 +1157,12 @@ defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
+defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
+defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
+defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>;
+defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 2b91ea7..a25ebdf 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
// Special case for v_permlane16_swap_b32/v_permlane32_swap_b32
// op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands.
-class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+class VOP3OpSelIsDPP_base {
bits<1> fi;
bits<1> bound_ctrl;
+}
+
+class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> {
+ // OPSEL[0] specifies FI
+ let Inst{11} = fi;
+ // OPSEL[1] specifies BOUND_CTRL
+ let Inst{12} = bound_ctrl;
+}
+class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> {
// OPSEL[0] specifies FI
let Inst{11} = fi;
// OPSEL[1] specifies BOUND_CTRL
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5bd3170..22cff7c 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -43,12 +43,12 @@ namespace {
class HexagonDisassembler : public MCDisassembler {
public:
std::unique_ptr<MCInstrInfo const> const MCII;
- std::unique_ptr<MCInst *> CurrentBundle;
+ mutable std::unique_ptr<MCInst> CurrentBundle;
mutable MCInst const *CurrentExtender;
HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
MCInstrInfo const *MCII)
- : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *),
+ : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr),
CurrentExtender(nullptr) {}
DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
@@ -57,7 +57,23 @@ public:
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const override;
+
+ DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const override;
+
void remapInstruction(MCInst &Instr) const;
+
+private:
+ bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &BytesToSkip, raw_ostream &CS) const;
+
+ void resetBundle() const { // Drop the cached packet and its cursor.
+ CurrentBundle.reset();
+ CurrentInstruction = nullptr; // Cursor pointed into the bundle just freed.
+ }
+
+ mutable MCOperand *CurrentInstruction = nullptr;
};
static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI,
@@ -171,43 +187,88 @@ LLVMInitializeHexagonDisassembler() {
createHexagonDisassembler);
}
-DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
- ArrayRef<uint8_t> Bytes,
- uint64_t Address,
- raw_ostream &CS) const {
- CommentStream = &CS;
-
- DecodeStatus Result = DecodeStatus::Success;
+bool HexagonDisassembler::makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &BytesToSkip,
+ raw_ostream &CS) const {
bool Complete = false;
- Size = 0;
+ DecodeStatus Result = DecodeStatus::Success;
- *CurrentBundle = &MI;
- MI.setOpcode(Hexagon::BUNDLE);
- MI.addOperand(MCOperand::createImm(0));
+ CurrentBundle.reset(new MCInst);
+ CurrentBundle->setOpcode(Hexagon::BUNDLE);
+ CurrentBundle->addOperand(MCOperand::createImm(0));
while (Result == Success && !Complete) {
if (Bytes.size() < HEXAGON_INSTR_SIZE)
- return MCDisassembler::Fail;
+ return false;
MCInst *Inst = getContext().createMCInst();
- Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete);
- MI.addOperand(MCOperand::createInst(Inst));
- Size += HEXAGON_INSTR_SIZE;
+ Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS,
+ Complete);
+ CurrentBundle->addOperand(MCOperand::createInst(Inst));
+ BytesToSkip += HEXAGON_INSTR_SIZE;
Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
}
if (Result == MCDisassembler::Fail)
- return Result;
- if (Size > HEXAGON_MAX_PACKET_SIZE)
- return MCDisassembler::Fail;
+ return false;
+ if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE)
+ return false;
const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI;
- HexagonMCChecker Checker(getContext(), *MCII, STI_, MI,
+ HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle,
*getContext().getRegisterInfo(), false);
if (!Checker.check())
- return MCDisassembler::Fail;
- remapInstruction(MI);
+ return false;
+ remapInstruction(*CurrentBundle);
+ return true;
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+
+ Size = 0;
+ uint64_t BytesToSkip = 0;
+
+ if (!CurrentBundle) { // Nothing cached: decode a whole packet first.
+ if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+ Size = BytesToSkip; // Report how far makeBundle advanced before failing.
+ resetBundle();
+ return MCDisassembler::Fail;
+ }
+ CurrentInstruction = (CurrentBundle->begin() + 1); // Skip imm operand 0.
+ } // NOTE(review): with a cached packet, Bytes/Address go unused.
+
+ MI = *(CurrentInstruction->getInst()); // One cached sub-instruction per call.
+ Size = HEXAGON_INSTR_SIZE;
+ if (++CurrentInstruction == CurrentBundle->end())
+ resetBundle(); // Packet exhausted; the next call decodes a new one.
+ return MCDisassembler::Success;
+}
+DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI,
+ uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
+ Size = 0;
+ uint64_t BytesToSkip = 0;
+ assert(!CurrentBundle); // Not valid while getInstruction is mid-packet.
+
+ if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+ Size = BytesToSkip; // Report how far makeBundle advanced before failing.
+ resetBundle();
+ return MCDisassembler::Fail;
+ }
+
+ MI = *CurrentBundle; // Copy out the whole BUNDLE MCInst.
+ Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI);
+ resetBundle(); // Nothing stays cached after a bundle-level decode.
+
+ return Success;
+}
+
void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) {
auto &MI = const_cast<MCInst &>(*I.getInst());
@@ -482,7 +543,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
unsigned Offset = 1;
bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
bool PrevVector = false;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle);
auto i = Instructions.end() - 1;
for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
if (i == n)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 9030e43..f83e06c 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) {
void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
- assert(HexagonMCInstrInfo::isBundle(*MI));
- assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
- assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
- HasExtender = false;
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
- MCInst const &MCI = *I.getInst();
- if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
- printInstruction(MCI.getOperand(1).getInst(), Address, OS);
- OS << '\v';
- HasExtender = false;
- printInstruction(MCI.getOperand(0).getInst(), Address, OS);
- } else
- printInstruction(&MCI, Address, OS);
- HasExtender = HexagonMCInstrInfo::isImmext(MCI);
- OS << "\n";
- }
-
- bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
- bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
- if (IsLoop0) {
- OS << (IsLoop1 ? " :endloop01" : " :endloop0");
- } else if (IsLoop1) {
- OS << " :endloop1";
+ if (HexagonMCInstrInfo::isDuplex(MII, *MI)) {
+ printInstruction(MI->getOperand(1).getInst(), Address, OS);
+ OS << '\v';
+ HasExtender = false;
+ printInstruction(MI->getOperand(0).getInst(), Address, OS);
+ } else {
+ printInstruction(MI, Address, OS);
}
+ HasExtender = HexagonMCInstrInfo::isImmext(*MI);
+ if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) ==
+ HexagonII::INST_PARSE_PACKET_END)
+ HasExtender = false;
}
void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 980df81..bfea50e 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -252,8 +252,21 @@ public:
std::string Buffer;
{
raw_string_ostream TempStream(Buffer);
- InstPrinter.printInst(&Inst, Address, "", STI, TempStream);
+ for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+ InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream);
+ TempStream << "\n";
+ }
+ }
+
+ std::string LoopString = "";
+ bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst);
+ bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst);
+ if (IsLoop0) {
+ LoopString += (IsLoop1 ? " :endloop01" : " :endloop0");
+ } else if (IsLoop1) {
+ LoopString += " :endloop1";
}
+
StringRef Contents(Buffer);
auto PacketBundle = Contents.rsplit('\n');
auto HeadTail = PacketBundle.first.split('\n');
@@ -275,9 +288,9 @@ public:
}
if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
- OS << "\n\t} :mem_noshuf" << PacketBundle.second;
+ OS << "\n\t} :mem_noshuf" << LoopString;
else
- OS << "\t}" << PacketBundle.second;
+ OS << "\t}" << LoopString;
}
void finish() override { finishAttributeSection(); }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 6c8e3da..23b4554 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -95,6 +95,11 @@ static const std::pair<MCPhysReg, int8_t> FixedCSRFIQCIInterruptMap[] = {
/* -21, -22, -23, -24 are reserved */
};
+/// Returns true if DWARF CFI instructions ("frame moves") are needed for MF.
+static bool needsDwarfCFI(const MachineFunction &MF) {
+ return MF.needsFrameMoves(); // Defers entirely to the function-level query.
+}
+
// For now we use x3, a.k.a gp, as pointer to shadow call stack.
// User should not use x3 in their asm.
static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB,
.addImm(-SlotSize)
.setMIFlag(MachineInstr::FrameSetup);
+ if (!needsDwarfCFI(MF))
+ return;
+
// Emit a CFI instruction that causes SlotSize to be subtracted from the value
// of the shadow stack pointer when unwinding past this frame.
char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true);
@@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB,
.addReg(SCSPReg)
.addImm(-SlotSize)
.setMIFlag(MachineInstr::FrameDestroy);
- // Restore the SCS pointer
- CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+ if (needsDwarfCFI(MF)) {
+ // Restore the SCS pointer
+ CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg);
+ }
}
// Insert instruction to swap mscratchsw with sp
@@ -935,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
getUnmanagedCSI(MF, CSI).size());
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
+ bool NeedsDwarfCFI = needsDwarfCFI(MF);
// If libcalls are used to spill and restore callee-saved registers, the frame
// has two sections; the opaque section managed by the libcalls, and the
@@ -962,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign());
RVFI->setLibCallStackSize(LibCallFrameSize);
- CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ CFIBuilder.buildDefCFAOffset(LibCallFrameSize);
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
// FIXME (note copied from Lanai): This appears to be overallocating. Needs
@@ -996,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// could only be the next instruction.
++PossiblePush;
- // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
- // could be. The PUSH will also get its own CFI metadata for its own
- // modifications, which should come after the PUSH.
- CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup);
- PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
- for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
- PushCFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)`
+ // could be. The PUSH will also get its own CFI metadata for its own
+ // modifications, which should come after the PUSH.
+ CFIInstBuilder PushCFIBuilder(MBB, PossiblePush,
+ MachineInstr::FrameSetup);
+ PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount);
+ for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI))
+ PushCFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
if (RVFI->isPushable(MF) && PossiblePush != MBB.end() &&
@@ -1017,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
PossiblePush->getOperand(1).setImm(StackAdj);
StackSize -= StackAdj;
- CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(),
- MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI) {
+ CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize);
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
+ }
}
// Allocate space on the stack if necessary.
@@ -1031,7 +1049,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
bool DynAllocation =
MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
if (StackSize != 0)
- allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
+ allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI,
NeedProbe, ProbeSize, DynAllocation,
MachineInstr::FrameSetup);
@@ -1049,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
- CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx()));
+ if (NeedsDwarfCFI)
+ for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+ CFIBuilder.buildOffset(CS.getReg(),
+ MFI.getObjectOffset(CS.getFrameIdx()));
// Generate new FP.
if (hasFP(MF)) {
@@ -1069,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup, getStackAlign());
}
- CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
+ if (NeedsDwarfCFI)
+ CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize());
}
uint64_t SecondSPAdjustAmount = 0;
@@ -1080,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
"SecondSPAdjustAmount should be greater than zero");
allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
- getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
- ProbeSize, DynAllocation, MachineInstr::FrameSetup);
+ getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF),
+ NeedProbe, ProbeSize, DynAllocation,
+ MachineInstr::FrameSetup);
}
if (RVVStackSize) {
if (NeedProbe) {
allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
- MachineInstr::FrameSetup, !hasFP(MF),
- DynAllocation);
+ MachineInstr::FrameSetup,
+ NeedsDwarfCFI && !hasFP(MF), DynAllocation);
} else {
// We must keep the stack pointer aligned through any intermediate
// updates.
@@ -1097,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineInstr::FrameSetup, getStackAlign());
}
- if (!hasFP(MF)) {
+ if (NeedsDwarfCFI && !hasFP(MF)) {
// Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
CFIBuilder.insertCFIInst(createDefCFAExpression(
*RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8));
}
std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
- emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
+ if (NeedsDwarfCFI)
+ emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF));
}
if (hasFP(MF)) {
@@ -1171,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF,
MachineInstr::FrameDestroy, getStackAlign());
StackSize = 0;
- CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
- .buildDefCFAOffset(CFAOffset);
+ if (needsDwarfCFI(MF))
+ CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
+ .buildDefCFAOffset(CFAOffset);
}
void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -1212,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size());
CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn,
MachineInstr::FrameDestroy);
+ bool NeedsDwarfCFI = needsDwarfCFI(MF);
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
uint64_t RealStackSize = FirstSPAdjustAmount ? FirstSPAdjustAmount
@@ -1232,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
StackOffset::getScalable(RVVStackSize),
MachineInstr::FrameDestroy, getStackAlign());
- if (!hasFP(MF))
- CFIBuilder.buildDefCFA(SPReg, RealStackSize);
-
- emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+ if (NeedsDwarfCFI) {
+ if (!hasFP(MF))
+ CFIBuilder.buildDefCFA(SPReg, RealStackSize);
+ emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn);
+ }
}
if (FirstSPAdjustAmount) {
@@ -1251,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
StackOffset::getFixed(SecondSPAdjustAmount),
MachineInstr::FrameDestroy, getStackAlign());
- if (!hasFP(MF))
+ if (NeedsDwarfCFI && !hasFP(MF))
CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount);
}
@@ -1272,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
getStackAlign());
}
- if (hasFP(MF))
+ if (NeedsDwarfCFI && hasFP(MF))
CFIBuilder.buildDefCFA(SPReg, RealStackSize);
// Skip to after the restores of scalar callee-saved registers
@@ -1295,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
}
// Recover callee-saved registers.
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
- CFIBuilder.buildRestore(CS.getReg());
+ if (NeedsDwarfCFI)
+ for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI))
+ CFIBuilder.buildRestore(CS.getReg());
if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) {
// Use available stack adjustment in pop instruction to deallocate stack
@@ -1315,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
auto NextI = next_nodbg(MBBI, MBB.end());
if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) {
++MBBI;
- CFIBuilder.setInsertPoint(MBBI);
+ if (NeedsDwarfCFI) {
+ CFIBuilder.setInsertPoint(MBBI);
- for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
- CFIBuilder.buildRestore(CS.getReg());
+ for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI))
+ CFIBuilder.buildRestore(CS.getReg());
- // Update CFA Offset. If this is a QCI interrupt function, there will be a
- // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise
- // getQCIInterruptStackSize() will be 0.
- CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+ // Update CFA Offset. If this is a QCI interrupt function, there will
+ // be a leftover offset which is deallocated by `QC.C.MILEAVERET`,
+ // otherwise getQCIInterruptStackSize() will be 0.
+ CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize());
+ }
}
}
@@ -1812,7 +1841,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
// allocateStack.
bool DynAllocation =
MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
- allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF),
+ allocateStack(MBB, MI, MF, -Amount, -Amount,
+ needsDwarfCFI(MF) && !hasFP(MF),
/*NeedProbe=*/true, ProbeSize, DynAllocation,
MachineInstr::NoFlags);
} else {
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 38cc0ce..dd68a55 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -102,6 +102,56 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
return false;
}
+/// Do the common operand retrieval and validation required by the
+/// routines below.
+static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
+ Instruction *I, Value *&Ptr, Value *&Mask,
+ Value *&VL, Align &Alignment) {
+
+ IRBuilder<> Builder(I);
+ const DataLayout &DL = I->getDataLayout();
+ ElementCount EC = VTy->getElementCount();
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(LI->isSimple());
+ Ptr = LI->getPointerOperand();
+ Alignment = LI->getAlign();
+ assert(!Mask && "Unexpected mask on a load");
+ Mask = Builder.getAllOnesMask(EC);
+ VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ assert(SI->isSimple());
+ Ptr = SI->getPointerOperand();
+ Alignment = SI->getAlign();
+ assert(!Mask && "Unexpected mask on a store");
+ Mask = Builder.getAllOnesMask(EC);
+ VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC)
+ : Constant::getAllOnesValue(XLenTy);
+ return true;
+ }
+ auto *VPLdSt = cast<VPIntrinsic>(I);
+ assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
+ VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
+ "Unexpected intrinsic");
+ Ptr = VPLdSt->getMemoryPointerParam();
+ Alignment = VPLdSt->getPointerAlignment().value_or(
+ DL.getABITypeAlign(VTy->getElementType()));
+
+ assert(Mask && "vp.load and vp.store needs a mask!");
+
+ Value *WideEVL = VPLdSt->getVectorLengthParam();
+ // Conservatively check if EVL is a multiple of factor, otherwise some
+ // (trailing) elements might be lost after the transformation.
+ if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+ return false;
+
+ auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+ VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+ return true;
+}
+
/// Lower an interleaved load into a vlsegN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -127,32 +177,8 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
Value *Ptr, *VL;
Align Alignment;
- if (auto *LI = dyn_cast<LoadInst>(Load)) {
- assert(LI->isSimple());
- Ptr = LI->getPointerOperand();
- Alignment = LI->getAlign();
- assert(!Mask && "Unexpected mask on a load\n");
- Mask = Builder.getAllOnesMask(VTy->getElementCount());
- VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
- } else {
- auto *VPLoad = cast<VPIntrinsic>(Load);
- assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
- "Unexpected intrinsic");
- Ptr = VPLoad->getMemoryPointerParam();
- Alignment = VPLoad->getPointerAlignment().value_or(
- DL.getABITypeAlign(VTy->getElementType()));
-
- assert(Mask && "vp.load needs a mask!");
-
- Value *WideEVL = VPLoad->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, DL, Factor))
- return false;
-
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- }
+ if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
+ return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
@@ -296,34 +322,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
Value *Ptr, *VL;
Align Alignment;
- if (auto *LI = dyn_cast<LoadInst>(Load)) {
- assert(LI->isSimple());
- Ptr = LI->getPointerOperand();
- Alignment = LI->getAlign();
- assert(!Mask && "Unexpected mask on a load\n");
- Mask = Builder.getAllOnesMask(ResVTy->getElementCount());
- VL = isa<FixedVectorType>(ResVTy)
- ? Builder.CreateElementCount(XLenTy, ResVTy->getElementCount())
- : Constant::getAllOnesValue(XLenTy);
- } else {
- auto *VPLoad = cast<VPIntrinsic>(Load);
- assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
- "Unexpected intrinsic");
- Ptr = VPLoad->getMemoryPointerParam();
- Alignment = VPLoad->getPointerAlignment().value_or(
- DL.getABITypeAlign(ResVTy->getElementType()));
-
- assert(Mask && "vp.load needs a mask!");
-
- Value *WideEVL = VPLoad->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
- return false;
-
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- }
+ if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
+ return false;
Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
@@ -385,34 +385,8 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
Value *Ptr, *VL;
Align Alignment;
- if (auto *SI = dyn_cast<StoreInst>(Store)) {
- assert(SI->isSimple());
- Ptr = SI->getPointerOperand();
- Alignment = SI->getAlign();
- assert(!Mask && "Unexpected mask on a store");
- Mask = Builder.getAllOnesMask(InVTy->getElementCount());
- VL = isa<FixedVectorType>(InVTy)
- ? Builder.CreateElementCount(XLenTy, InVTy->getElementCount())
- : Constant::getAllOnesValue(XLenTy);
- } else {
- auto *VPStore = cast<VPIntrinsic>(Store);
- assert(VPStore->getIntrinsicID() == Intrinsic::vp_store &&
- "Unexpected intrinsic");
- Ptr = VPStore->getMemoryPointerParam();
- Alignment = VPStore->getPointerAlignment().value_or(
- DL.getABITypeAlign(InVTy->getElementType()));
-
- assert(Mask && "vp.store needs a mask!");
-
- Value *WideEVL = VPStore->getVectorLengthParam();
- // Conservatively check if EVL is a multiple of factor, otherwise some
- // (trailing) elements might be lost after the transformation.
- if (!isMultipleOfN(WideEVL, DL, Factor))
- return false;
-
- auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
- VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
- }
+ if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment))
+ return false;
Type *PtrTy = Ptr->getType();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d91ea1ea..6281124 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1323,11 +1323,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
- if (Subtarget.hasGFNI()) {
+ if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
setOperationAction(ISD::BITREVERSE, MVT::i8, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i16, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i64, Custom);
+
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -32694,7 +32698,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
- assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
+ assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
+ "SSSE3 or GFNI required for BITREVERSE");
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 636bd81..9e318b0 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3790,6 +3790,11 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
continue;
}
+ // Do not consider uses inside lifetime intrinsics. These are not
+ // actually materialized.
+ if (UserInst->isLifetimeStartOrEnd())
+ continue;
+
std::pair<size_t, Immediate> P =
getUse(S, LSRUse::Basic, MemAccessTy());
size_t LUIdx = P.first;
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 6226596..40dc02c 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -232,13 +232,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
NewAI->setSwiftError(Info.AI->isSwiftError());
NewAI->copyMetadata(*Info.AI);
- Value *NewPtr = NewAI;
-
- // TODO: Remove when typed pointers dropped
- if (Info.AI->getType() != NewAI->getType())
- NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI->getIterator());
-
- Info.AI->replaceAllUsesWith(NewPtr);
+ Info.AI->replaceAllUsesWith(NewAI);
Info.AI->eraseFromParent();
Info.AI = NewAI;
}
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index 0c56e1b..d428b6a 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -395,3 +395,37 @@ define i64 @freeze_array() {
%t1 = add i64 %v1, %v2
ret i64 %t1
}
+
+define <8 x i16> @freeze_abdu(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: freeze_abdu:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uaba v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: freeze_abdu:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uabd v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
+ %d = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %f = freeze <8 x i16> %d
+ %r = add <8 x i16> %a, %f
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @freeze_abds(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-SD-LABEL: freeze_abds:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: freeze_abds:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sabd v1.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
+ %d = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b)
+ %f = freeze <8 x i16> %d
+ %r = add <8 x i16> %a, %f
+ ret <8 x i16> %r
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
index 4153f0b..9698f1a 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll
@@ -231,3 +231,27 @@ define <vscale x 8 x i64> @sload_8i8_8i64(ptr %a) {
%aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
ret <vscale x 8 x i64> %aext
}
+
+; Ensure we don't try to promote a predicate load to a sign-extended load.
+define <vscale x 16 x i8> @sload_16i1_16i8(ptr %addr) {
+; CHECK-LABEL: sload_16i1_16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr p0, [x0]
+; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ret
+ %load = load <vscale x 16 x i1>, ptr %addr
+ %zext = sext <vscale x 16 x i1> %load to <vscale x 16 x i8>
+ ret <vscale x 16 x i8> %zext
+}
+
+; Ensure we don't try to promote a predicate load to a zero-extended load.
+define <vscale x 16 x i8> @zload_16i1_16i8(ptr %addr) {
+; CHECK-LABEL: zload_16i1_16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr p0, [x0]
+; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT: ret
+ %load = load <vscale x 16 x i1>, ptr %addr
+ %zext = zext <vscale x 16 x i1> %load to <vscale x 16 x i8>
+ ret <vscale x 16 x i8> %zext
+}
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 50d20e9..6cb236d 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -780,7 +780,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
@@ -789,11 +790,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index cb2f0f2..0d5f538 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -6549,319 +6549,314 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
@@ -15418,63 +15413,63 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -15488,146 +15483,144 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15641,720 +15634,746 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -42137,64 +42156,64 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -42222,50 +42241,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -42309,50 +42328,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB36_4: ; %end
@@ -42360,319 +42379,314 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
@@ -52196,63 +52210,63 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -52266,146 +52280,144 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -52419,720 +52431,746 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -77900,64 +77938,64 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -77985,50 +78023,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -78097,50 +78135,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB56_4: ; %end
@@ -78148,319 +78186,314 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
@@ -87027,63 +87060,63 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -87097,146 +87130,144 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -87250,720 +87281,746 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -111743,64 +111800,64 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -111828,50 +111885,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -111915,50 +111972,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
@@ -111966,319 +112023,314 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
@@ -121787,63 +121839,63 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -121857,146 +121909,144 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -122010,720 +122060,746 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -160055,116 +160131,116 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
@@ -160187,341 +160263,338 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v94.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h
; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v55.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v36, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v34, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v37, v51, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v80.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v70
+; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v83, v33, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v83.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v71
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v82.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_cndmask_b32 v84, v19, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20
; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v84
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v85.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v86.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v33, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v86.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v96.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v87
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v97.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v26, v38 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v26, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v98
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v25, v39, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v100.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v99.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v100
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
@@ -160529,21 +160602,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25
; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1
@@ -160556,45 +160630,44 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v115.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v114.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1
@@ -160607,10 +160680,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v115
+; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36
@@ -160622,9 +160695,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v131.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v144, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -160640,252 +160713,240 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v144.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v33, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v146.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v162.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v147.h
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v38 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v149
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v164.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v6, v33, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v165
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v5, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v178.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v35, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v179
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v45, v7, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v46.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v45.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v42
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v35, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v60, v48, v52 :: v_dual_add_f32 v37, 0x40c00000, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v60
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v74.h
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v12
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_lshlrev_b32 v16, 16, v16
; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v14, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v39, v48, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v77.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v16, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v94, v35, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v13, v49, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v94.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v76
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v77
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v104.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v91.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14
; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
@@ -160905,332 +160966,327 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7
; GFX11-TRUE16-NEXT: .LBB90_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v106.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v107.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v107.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v78.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v105.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v162.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v93.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v180.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v90.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v164.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v165.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v179.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v46.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v75.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v63.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v179.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v62.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v74.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v45.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v89.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v41.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v89.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v40.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v42.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v43.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v60.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v181.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v94.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v167.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v59.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v182.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v77.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v20, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v76.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v91.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v92.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v73.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v79.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v72.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v12.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v47.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v58.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v44.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v177.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v41.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v183.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v176.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v166.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v28, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v28
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v14
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[19:22], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:112
; GFX11-TRUE16-NEXT: s_clause 0x1f
; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:16
@@ -185249,64 +185305,64 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -185331,52 +185387,52 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
@@ -185436,371 +185492,366 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
@@ -208007,64 +208058,64 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -208089,52 +208140,52 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
@@ -208194,371 +208245,366 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 34d7ed9..3e96ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -2675,79 +2675,76 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -4122,18 +4119,18 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -4147,107 +4144,103 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -7140,79 +7133,76 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB46_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -8603,18 +8593,18 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -8628,107 +8618,103 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11253,79 +11239,76 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB66_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -12700,18 +12683,18 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -12725,107 +12708,103 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -14952,79 +14931,76 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0
; GFX11-TRUE16-NEXT: .LBB82_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -16407,18 +16383,18 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -16432,107 +16408,103 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -18254,83 +18226,83 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v11, v12 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v5
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v6
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -19840,18 +19812,18 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -19865,107 +19837,103 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -21172,79 +21140,79 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v1, v7, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v1, v7 :: v_dual_and_b32 v7, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v11, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v7 :: v_dual_add_f32 v7, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v12, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v13, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v7, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v7, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v9, v4
; GFX11-TRUE16-NEXT: .LBB102_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -22758,18 +22726,18 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -22783,107 +22751,103 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -23876,87 +23840,92 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v14, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v13, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v11
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v14, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v0, v9, v13, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_and_b32 v5, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v7, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v5, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v14, v10, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v9, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v1, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v13, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v9, v1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v9, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
@@ -24976,18 +24945,18 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -25001,107 +24970,103 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index 2c78e34..5344095 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -659,7 +659,8 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
@@ -1132,7 +1133,8 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index a19567b..f8ffaa4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6296,33 +6296,32 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -6334,188 +6333,194 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13330,33 +13335,32 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -13368,188 +13372,194 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19882,33 +19892,32 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -19920,188 +19929,194 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -25924,33 +25939,32 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -25962,188 +25976,194 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index e773b54..0cefbc1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -2966,20 +2966,20 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -2995,17 +2995,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow
@@ -3029,17 +3029,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -3047,100 +3047,105 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8:
@@ -5033,50 +5038,48 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
@@ -5091,228 +5094,243 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9933,20 +9951,20 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -9962,17 +9980,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow
@@ -9992,17 +10010,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB32_4: ; %end
@@ -10010,100 +10028,105 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8:
@@ -12014,50 +12037,48 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3
@@ -12072,228 +12093,243 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16322,20 +16358,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -16351,17 +16387,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow
@@ -16385,17 +16421,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_4: ; %end
@@ -16403,100 +16439,105 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8:
@@ -22438,20 +22479,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -22467,17 +22508,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow
@@ -22501,17 +22542,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB60_4: ; %end
@@ -22519,100 +22560,105 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8:
@@ -28813,50 +28859,50 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3
@@ -28871,228 +28917,243 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v23.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v22.h, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v23.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v21.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v24.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v26.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v16.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v17.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -30847,20 +30908,20 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -30876,17 +30937,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow
@@ -30905,17 +30966,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB74_4: ; %end
@@ -30923,100 +30984,105 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8:
@@ -32944,50 +33010,50 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3
@@ -33002,228 +33068,243 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v23.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v22.h, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v23.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v21.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v24.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v26.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v16.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v17.h, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -34993,20 +35074,20 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -35022,17 +35103,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow
@@ -35059,17 +35140,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB78_4: ; %end
@@ -35077,100 +35158,105 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index bfed1f4..48c9b87 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -2273,18 +2273,19 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -2294,16 +2295,17 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4504,8 +4506,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -4520,18 +4522,19 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
@@ -4541,16 +4544,17 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6463,8 +6467,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -6479,18 +6483,19 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -6500,16 +6505,17 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8110,8 +8116,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -8126,18 +8132,19 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
@@ -8147,16 +8154,17 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9471,8 +9479,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -9487,18 +9495,19 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
@@ -9508,16 +9517,17 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10183,8 +10193,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -10199,18 +10209,19 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
@@ -10220,16 +10231,17 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 45e205b..68312b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -145,37 +145,36 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -797,40 +796,40 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2
; GFX11-TRUE16-NEXT: .LBB4_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index d524514..5aac06a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -8768,32 +8768,32 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -8812,26 +8812,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow
@@ -8864,26 +8864,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB24_4: ; %end
@@ -8891,151 +8891,156 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -12465,13 +12470,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
@@ -12487,81 +12492,83 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -12574,366 +12581,384 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -23563,32 +23588,32 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -23607,26 +23632,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow
@@ -23651,26 +23676,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_4: ; %end
@@ -23678,151 +23703,156 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -27383,13 +27413,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
@@ -27405,81 +27435,83 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -27492,366 +27524,384 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -37866,32 +37916,32 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -37910,26 +37960,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow
@@ -37967,26 +38017,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB68_4: ; %end
@@ -37994,151 +38044,156 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -41573,13 +41628,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
@@ -41595,81 +41650,83 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -41682,366 +41739,384 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -51220,32 +51295,32 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -51264,26 +51339,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow
@@ -51308,26 +51383,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB84_4: ; %end
@@ -51335,151 +51410,156 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -54909,13 +54989,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
@@ -54931,81 +55011,83 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -55018,366 +55100,384 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -64473,32 +64573,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -64517,26 +64617,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow
@@ -64569,26 +64669,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB96_4: ; %end
@@ -64596,151 +64696,156 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -76596,32 +76701,32 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -76640,26 +76745,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow
@@ -76692,26 +76797,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB104_4: ; %end
@@ -76719,151 +76824,156 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -85582,58 +85692,58 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -85648,297 +85758,304 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v20, v22, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v17, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22
; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v6, v17 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v34.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v36
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v7, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v38.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v23
-; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v39 :: v_dual_cndmask_b32 v52, v22, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10
+; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21
@@ -85950,42 +86067,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v13, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v13, v25, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v23, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v23, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v84.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v82
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9
; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13
; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
@@ -85994,160 +86111,159 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v24.l
; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v22, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index a40ee16..6fe6665 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -2160,46 +2160,47 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB22_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -3064,12 +3065,13 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -3083,66 +3085,61 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5307,46 +5304,47 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB46_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -6216,12 +6214,13 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -6235,66 +6234,61 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8166,46 +8160,47 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB66_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -9068,12 +9063,13 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -9087,66 +9083,61 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10698,46 +10689,47 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4
; GFX11-TRUE16-NEXT: .LBB82_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -11611,12 +11603,13 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -11630,66 +11623,61 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -12867,49 +12855,49 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v0, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v3
; GFX11-TRUE16-NEXT: .LBB94_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -13841,12 +13829,13 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -13860,66 +13849,61 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -14696,46 +14680,46 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX11-TRUE16-NEXT: .LBB102_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -15671,12 +15655,13 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -15690,66 +15675,61 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16347,42 +16327,41 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v0 :: v_dual_lshlrev_b32 v0, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v12, v4, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v0 :: v_dual_add_f32 v0, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v3, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v3, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9]
@@ -16987,12 +16966,13 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -17006,66 +16986,61 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 5163539..e5245f7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -1102,15 +1102,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -1126,76 +1126,79 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2084,62 +2087,57 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB10_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -4243,15 +4241,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -4267,76 +4265,79 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5229,62 +5230,57 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB26_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -6889,16 +6885,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -6914,77 +6910,79 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2
; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -7783,67 +7781,68 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v13.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v11, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v11, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v0, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v13, v14, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v5, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v7, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v2, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7fc07fc0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.h
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v5, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10
; GFX11-TRUE16-NEXT: .LBB38_4: ; %end
@@ -8652,16 +8651,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -8677,77 +8676,79 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2
; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10064,16 +10065,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -10089,77 +10090,79 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2
; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11446,59 +11449,61 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v8, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v3
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v8, v12 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v12 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v5
; GFX11-TRUE16-NEXT: .LBB48_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -12390,64 +12395,66 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v4, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v9, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v10 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v0, 16, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v3
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v4
; GFX11-TRUE16-NEXT: .LBB52_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll
new file mode 100644
index 0000000..535f05bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
+
+@global_smem = external local_unnamed_addr addrspace(1) global [0 x i8], align 16
+
+define amdgpu_kernel void @v_atomicrmw_fadd_bf16(ptr addrspace(1) %out, i1 %in, ptr addrspace(1) %ptr) #0 {
+; GFX11-TRUE16-LABEL: v_atomicrmw_fadd_bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] offset:4
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, -4
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, 0xffff, s2
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-TRUE16-NEXT: s_not_b32 s3, s4
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: .p2align 6
+; GFX11-TRUE16-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v0, v4, v[0:1], s[0:1] glc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_gl1_inv
+; GFX11-TRUE16-NEXT: buffer_gl0_inv
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_atomicrmw_fadd_bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1] offset:4
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, -4
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s3
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, 0xffff, s2
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-FAKE16-NEXT: s_not_b32 s3, s4
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT: .p2align 6
+; GFX11-FAKE16-NEXT: .LBB0_1: ; %atomicrmw.start
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_gl1_inv
+; GFX11-FAKE16-NEXT: buffer_gl0_inv
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-FAKE16-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
+ %load = load <4 x bfloat>, ptr addrspace(1) %in.gep
+ %extract1 = extractelement <4 x bfloat> %load, i64 3
+ %fadd = atomicrmw fadd ptr addrspace(1) %out, bfloat %extract1 syncscope("agent") acq_rel
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 2bdf994..cd6d741 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -9082,17 +9082,19 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_fadd_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13318,9 +13320,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_0:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13413,9 +13416,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_1:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13515,17 +13519,19 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_fsub_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -14275,17 +14281,19 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_fmul_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -18568,32 +18576,34 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_fdiv_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
-; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11TRUE16-NEXT: v_div_scale_f32 v1, null, v0, v0, v2
+; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v2, v0, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v1
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v3, 1.0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v3
-; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v5, v3
+; GFX11TRUE16-NEXT: v_fma_f32 v6, -v1, v4, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v6, v3
+; GFX11TRUE16-NEXT: v_fma_f32 v1, -v1, v4, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fma_f32 v2, -v2, v4, v5
-; GFX11TRUE16-NEXT: v_div_fmas_f32 v2, v2, v3, v4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX11TRUE16-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v1, v0, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19018,17 +19028,19 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_minnum_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -23270,17 +23282,19 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX11TRUE16-LABEL: v_maxnum_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -27591,11 +27605,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_sqrt_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -27730,9 +27745,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX11TRUE16-LABEL: v_ldexp_bf16_i32:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v2, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -27836,17 +27852,18 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX11TRUE16-LABEL: v_frexp_bf16_i16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11TRUE16-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -28019,11 +28036,12 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_log_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
-; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -28177,13 +28195,14 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_log2_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
@@ -28367,11 +28386,12 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_log10_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
-; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
@@ -28580,25 +28600,26 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_exp_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1
-; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1
+; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v1
+; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x3fb8aa3b, v1, -v0
+; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x32a5705f, v2
+; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3
+; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -28744,13 +28765,14 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_exp2_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
@@ -28937,25 +28959,26 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_exp10_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
-; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1
-; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x40549a78, v0, -v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x40549a78, v1
+; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v1
+; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x40549a78, v1, -v0
+; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2
-; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3
-; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x33979a37, v2
+; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3
+; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -29066,9 +29089,10 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_ceil_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29163,9 +29187,10 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_trunc_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29260,9 +29285,10 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_rint_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29357,9 +29383,10 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_nearbyint_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29483,16 +29510,17 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_round_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_trunc_f32_e32 v1, v0
-; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v0, v1
+; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1
+; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v1, v0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v2, v1
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -29594,9 +29622,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_roundeven_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29691,9 +29720,10 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_floor_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29786,9 +29816,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX11TRUE16-LABEL: v_canonicalize_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -29916,15 +29947,27 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_oeq_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_oeq_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_oeq_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oeq bfloat %a, %b
ret i1 %op
}
@@ -29979,15 +30022,27 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ogt_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ogt_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ogt_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ogt bfloat %a, %b
ret i1 %op
}
@@ -30042,15 +30097,27 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_oge_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_oge_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_oge_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp oge bfloat %a, %b
ret i1 %op
}
@@ -30105,15 +30172,27 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_olt_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_olt_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_olt_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp olt bfloat %a, %b
ret i1 %op
}
@@ -30168,15 +30247,27 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ole_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ole_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ole_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ole bfloat %a, %b
ret i1 %op
}
@@ -30231,15 +30322,27 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_one_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_one_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_one_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp one bfloat %a, %b
ret i1 %op
}
@@ -30294,15 +30397,27 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_uno_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_uno_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_uno_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uno bfloat %a, %b
ret i1 %op
}
@@ -30357,15 +30472,27 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ueq_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ueq_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ueq_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ueq bfloat %a, %b
ret i1 %op
}
@@ -30420,15 +30547,27 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ugt_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ugt_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ugt_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ugt bfloat %a, %b
ret i1 %op
}
@@ -30483,15 +30622,27 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_uge_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_uge_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_uge_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp uge bfloat %a, %b
ret i1 %op
}
@@ -30546,15 +30697,27 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ult_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ult_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ult_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ult bfloat %a, %b
ret i1 %op
}
@@ -30609,15 +30772,27 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_ule_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_ule_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_ule_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp ule bfloat %a, %b
ret i1 %op
}
@@ -30672,15 +30847,27 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fcmp_une_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fcmp_une_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11TRUE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v1, v2
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fcmp_une_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fcmp une bfloat %a, %b
ret i1 %op
}
@@ -30763,13 +30950,22 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fptosi_bf16_to_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i16
ret i16 %op
}
@@ -31144,13 +31340,22 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fptosi_bf16_to_i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i32:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i32:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i32
ret i32 %op
}
@@ -31494,27 +31699,50 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fptosi_bf16_to_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_trunc_f32_e32 v0, v0
-; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_floor_f32_e32 v1, v1
-; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
-; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2
-; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i64:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1
+; GFX11TRUE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_floor_f32_e32 v1, v1
+; GFX11TRUE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v2
+; GFX11TRUE16-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_xor_b32_e32 v0, v0, v3
+; GFX11TRUE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i64:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11FAKE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_floor_f32_e32 v1, v1
+; GFX11FAKE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v2
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, v0, v3
+; GFX11FAKE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = fptosi bfloat %x to i64
ret i64 %op
}
@@ -42575,18 +42803,21 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11TRUE16-LABEL: v_fma_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -43457,26 +43688,30 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11TRUE16-LABEL: v_fmuladd_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 348862d..f4b432d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -5100,55 +5100,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5257,53 +5258,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5619,48 +5621,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5773,46 +5776,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -6124,15 +6128,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX12-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -6146,39 +6150,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-TRUE16-NEXT: ; %bb.2:
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX12-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -6193,14 +6196,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -6208,7 +6211,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6384,16 +6387,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX11-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -6405,39 +6408,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-TRUE16-NEXT: ; %bb.2:
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -6451,14 +6453,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6468,7 +6470,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ab867b0..6f1675e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -4228,55 +4228,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4385,53 +4386,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4749,48 +4751,49 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4903,46 +4906,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5256,15 +5260,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -5278,39 +5282,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -5325,14 +5328,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5340,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5516,16 +5519,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -5537,39 +5540,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -5583,14 +5585,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5600,7 +5602,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 1a25904..acb27be 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -4228,55 +4228,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4385,53 +4386,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4749,48 +4751,49 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
; GFX12-TRUE16-NEXT: s_not_b32 s6, s5
; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -4903,46 +4906,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4
+; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4
; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
@@ -5256,15 +5260,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -5278,39 +5282,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-TRUE16-NEXT: ; %bb.2:
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v10
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v8
+; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -5325,14 +5328,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -5340,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -5516,16 +5519,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -5537,39 +5540,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
; GFX11-TRUE16-NEXT: ; %bb.2:
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5
-; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -5583,14 +5585,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -5600,7 +5602,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index a1aef8d..da49140 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -2748,100 +2748,101 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 3, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, v6.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 2, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 1, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 2, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, v6.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v6.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 1, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v4.l, 1
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v28.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v26.l, 1
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v3.l, 15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 1, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 2, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 1, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 3, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v24.l, 1
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 1, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v22.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v20.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, v18.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 1, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, v16.l, 1
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v30.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 3, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 2, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 2, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 3, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 2, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 3, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v5.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v12.h, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v5.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v7.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, v3.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, v3.l, 15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 4, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v4.l, 15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 12, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 12, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 745e047..86e890b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1771,33 +1771,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x900, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 738bad7..f26b720 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -2811,20 +2811,20 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2852,20 +2852,20 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2945,20 +2945,20 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3f00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2986,20 +2986,20 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3f00, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3105,34 +3105,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test3:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x4000, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x4000, s0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test3:
@@ -3170,34 +3170,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test3:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x4000, s0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x4000, s0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
-; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test3:
@@ -3314,34 +3314,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test4:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x3f00, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x3f00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x3f00, s0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test4:
@@ -3379,34 +3379,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test4:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x3f00, s0
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x3f00, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x3f00, s0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
-; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test4:
@@ -3498,20 +3498,20 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4100
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3539,20 +3539,20 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4100
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3634,20 +3634,20 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4040
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc100, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3675,20 +3675,20 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4040
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc100, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3769,20 +3769,20 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc080
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4100, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3810,20 +3810,20 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc080
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4100, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -3902,12 +3902,13 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test8:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0, 0x8000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -3940,12 +3941,13 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test8:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0, 0x8000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
@@ -4033,20 +4035,20 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc200
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc180, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4074,20 +4076,20 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc200
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc180, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4169,20 +4171,20 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xdb80
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xe000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4210,20 +4212,20 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xdb80
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xe000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4305,20 +4307,20 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4c00
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3480, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4346,20 +4348,20 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4c00
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3480, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 45fe2d0..85e56a2 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -77,11 +77,20 @@ define i32 @divergent_vec_0_i16(i16 %a) {
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: divergent_vec_0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x i16> poison, i16 0, i32 0
%vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
%val = bitcast <2 x i16> %vec to i32
@@ -160,11 +169,20 @@ define i32 @divergent_vec_i16_0(i16 %a) {
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: divergent_vec_i16_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_i16_0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_i16_0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x i16> poison, i16 %a, i32 0
%vec = insertelement <2 x i16> %tmp, i16 0, i32 1
%val = bitcast <2 x i16> %vec to i32
@@ -243,11 +261,20 @@ define float @divergent_vec_f16_0(half %a) {
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: divergent_vec_f16_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_f16_0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_f16_0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tmp = insertelement <2 x half> poison, half %a, i32 0
%vec = insertelement <2 x half> %tmp, half 0.0, i32 1
%val = bitcast <2 x half> %vec to float
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 6cd4399..d8f81db 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -624,31 +624,30 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_mul_f32 v2, v1, v2 :: v_dual_lshlrev_b32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
@@ -809,35 +808,35 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_and_b32 s2, s4, 0xffff0000
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, s2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, s2, v1
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
@@ -988,34 +987,36 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, 4.0, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 2.0, v1
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 4.0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 2.0, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
-; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v1, off dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_extract_fabs_fold_v2bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 8581e4d03..8c7d5cf 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -11974,7 +11974,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -11988,20 +11988,22 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12121,7 +12123,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -12136,19 +12138,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12425,34 +12429,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12578,12 +12582,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -12594,19 +12597,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12891,34 +12896,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13045,12 +13050,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13061,19 +13065,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13355,45 +13361,45 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13505,45 +13511,46 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13806,45 +13813,45 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13957,45 +13964,46 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -14259,27 +14267,28 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14379,27 +14388,28 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14629,32 +14639,33 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -14744,33 +14755,34 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -14988,7 +15000,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -15002,18 +15014,20 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -15130,7 +15144,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -15145,17 +15159,19 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -15424,34 +15440,34 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -15579,12 +15595,11 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -15595,19 +15610,21 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -15891,46 +15908,46 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -16043,45 +16060,46 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 883063b..56ad91d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -9836,7 +9836,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9850,20 +9850,22 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9983,7 +9985,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9998,19 +10000,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10288,34 +10292,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10441,12 +10445,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10457,19 +10460,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10755,34 +10760,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10909,12 +10914,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10925,19 +10929,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -11220,7 +11226,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -11234,18 +11240,20 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11362,7 +11370,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -11377,17 +11385,19 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11654,45 +11664,45 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11804,45 +11814,46 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12106,45 +12117,45 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12257,45 +12268,46 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12560,27 +12572,28 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12680,27 +12693,28 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12931,32 +12945,33 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13046,33 +13061,34 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13294,34 +13310,34 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13449,12 +13465,11 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13465,19 +13480,21 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13762,46 +13779,46 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13914,45 +13931,46 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index c603421..f0083bd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -9836,7 +9836,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9850,20 +9850,22 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9983,7 +9985,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9998,19 +10000,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10288,34 +10292,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10441,12 +10445,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10457,19 +10460,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10755,34 +10760,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10909,12 +10914,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10925,19 +10929,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -11220,7 +11226,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -11234,18 +11240,20 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11362,7 +11370,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -11377,17 +11385,19 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11654,45 +11664,45 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11804,45 +11814,46 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12106,45 +12117,45 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12257,45 +12268,46 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12560,27 +12572,28 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12680,27 +12693,28 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12931,32 +12945,33 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13046,33 +13061,34 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13294,34 +13310,34 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13449,12 +13465,11 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13465,19 +13480,21 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13762,46 +13779,46 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13914,45 +13931,46 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index c987effe..3ee0bb2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -9419,7 +9419,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9433,20 +9433,22 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9566,7 +9568,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9581,19 +9583,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9870,34 +9874,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10023,12 +10027,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10039,19 +10042,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10336,34 +10341,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10490,12 +10495,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10506,19 +10510,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10800,7 +10806,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10814,18 +10820,20 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10942,7 +10950,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10957,17 +10965,19 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11233,45 +11243,45 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11383,45 +11393,46 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11684,45 +11695,45 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11835,45 +11846,46 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12137,27 +12149,28 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12257,27 +12270,28 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12507,32 +12521,33 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12622,33 +12637,34 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12869,34 +12885,34 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13024,12 +13040,11 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13040,19 +13055,21 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13336,46 +13353,46 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13488,45 +13505,46 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index 2f08931..8784352 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1872,63 +1872,48 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX11-SDAG-TRUE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -1985,74 +1970,59 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
@@ -2505,114 +2475,88 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0:
@@ -2710,123 +2654,103 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> %
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
index 0742ac7..bc85dc2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
@@ -69,21 +69,16 @@ define bfloat @v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum(bfloat %a) #1 {
; GFX11-SDAG-TRUE16-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4000, v0.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4000, v1.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 4.0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v1.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
%max = call bfloat @llvm.maximumnum.bf16(bfloat %a, bfloat 2.0)
%med = call bfloat @llvm.minimumnum.bf16(bfloat %max, bfloat 4.0)
@@ -196,35 +191,26 @@ define <2 x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e64 s0, v2, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x4000, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e64 s0, v1, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x4000, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, 2.0, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 4.0, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 4.0, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4080, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 4.0, v2
; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v0.l, s0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
%max = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> splat (bfloat 2.0))
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index 969c6c3..7b2d793 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1874,63 +1874,48 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX11-SDAG-TRUE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -1987,74 +1972,59 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v1
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2510,114 +2480,88 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v4, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v3
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0:
@@ -2715,123 +2659,103 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> %
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v8
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v4, v5
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.h
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v3
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index a901d7f..f8ff8ef 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1109,18 +1109,19 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1191,19 +1192,20 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1289,23 +1291,24 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v0.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v8i8:
@@ -1419,46 +1422,47 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v16i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v8.h, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v16i8:
@@ -1654,85 +1658,84 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v7.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.h, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v13.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v14.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v15.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v8
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v9.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v17, v18
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index dd7f183..facc91a 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -4896,22 +4896,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
@@ -5156,18 +5157,22 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
@@ -5175,9 +5180,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5440,36 +5442,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[0:1], off
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5911,81 +5912,83 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v2.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v3, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[7:10], off
-; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[3:6], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 100a560..1f74fbd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -12320,7 +12320,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -12334,20 +12334,22 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12467,7 +12469,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -12482,19 +12484,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12821,34 +12825,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12974,12 +12978,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -12990,19 +12993,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13339,34 +13344,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13493,12 +13498,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13509,19 +13513,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13855,7 +13861,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -13869,18 +13875,20 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -13997,7 +14005,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -14012,17 +14020,19 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -14337,45 +14347,45 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -14487,45 +14497,46 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -14838,45 +14849,45 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -14989,45 +15000,46 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -15341,27 +15353,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -15461,27 +15474,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -15750,32 +15764,33 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -15865,33 +15880,34 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -16149,34 +16165,34 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -16304,12 +16320,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -16320,19 +16335,21 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -16668,46 +16685,46 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -16820,45 +16837,46 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index faa3ee6..faa74fe 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -8755,20 +8755,22 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -8888,7 +8890,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -8903,19 +8905,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9244,34 +9248,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9397,12 +9401,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9413,19 +9416,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9764,34 +9769,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9918,12 +9923,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9934,19 +9938,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10282,7 +10288,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10296,18 +10302,20 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10424,7 +10432,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10439,17 +10447,19 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10766,45 +10776,45 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -10916,45 +10926,46 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11269,45 +11280,45 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11420,45 +11431,46 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11774,27 +11786,28 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -11894,27 +11907,28 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -12185,32 +12199,33 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12300,33 +12315,34 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12586,34 +12602,34 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12741,12 +12757,11 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -12757,19 +12772,21 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13107,46 +13124,46 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13259,45 +13276,46 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index cb66f85..a46b012 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -8755,20 +8755,22 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -8888,7 +8890,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -8903,19 +8905,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9244,34 +9248,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9397,12 +9401,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9413,19 +9416,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9764,34 +9769,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9918,12 +9923,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9934,19 +9938,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10282,7 +10288,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10296,18 +10302,20 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10424,7 +10432,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10439,17 +10447,19 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10766,45 +10776,45 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -10916,45 +10926,46 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11269,45 +11280,45 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11420,45 +11431,46 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11774,27 +11786,28 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -11894,27 +11907,28 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -12185,32 +12199,33 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12300,33 +12315,34 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12586,34 +12602,34 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12741,12 +12757,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -12757,19 +12772,21 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13107,46 +13124,46 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13259,45 +13276,46 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index f869b57..053efdc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -9266,7 +9266,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9280,20 +9280,22 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9413,7 +9415,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -9428,19 +9430,21 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9767,34 +9771,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -9920,12 +9924,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -9936,19 +9939,21 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10285,34 +10290,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10439,12 +10444,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10455,19 +10459,21 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -10801,7 +10807,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10815,18 +10821,20 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -10943,7 +10951,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
@@ -10958,17 +10966,19 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
@@ -11283,45 +11293,45 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11433,45 +11443,46 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -11784,45 +11795,45 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -11935,45 +11946,46 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -12287,27 +12299,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -12407,27 +12420,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -12696,32 +12710,33 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -12811,33 +12826,34 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -13095,34 +13111,34 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4
; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13250,12 +13266,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -13266,19 +13281,21 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -13614,46 +13631,46 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -13766,45 +13783,46 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7
-; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index d588f0e..723e3ef 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4007,6 +4007,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
@@ -4053,6 +4055,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
@@ -4411,7 +4415,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
@@ -4457,7 +4461,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
; GFX12-GISEL-TRUE16: ; %bb.0:
; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
@@ -4882,7 +4886,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
@@ -5002,7 +5006,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 5e50288..7ebd692 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2724,31 +2724,31 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 0d33400..3261e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -917,47 +917,91 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
}
define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) {
- ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
- ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-WF32-NEXT: {{ $}}
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
- ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
- ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-TRUE16-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF32-TRUE16: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[COPY]], %subreg.hi16
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-TRUE16-NEXT: S_ENDPGM 0
;
- ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat
- ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
- ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8
- ; DAGISEL-GFX11-WF64-NEXT: {{ $}}
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
- ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
- ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
- ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
- ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
+ ; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF32-FAKE16: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF32-FAKE16-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-TRUE16-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF64-TRUE16: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[COPY]], %subreg.hi16
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-TRUE16-NEXT: S_ENDPGM 0
+ ;
+ ; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_cc_bfloat
+ ; DAGISEL-GFX11-WF64-FAKE16: bb.0 (%ir-block.0):
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: liveins: $sgpr0, $vgpr8
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: {{ $}}
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+ ; DAGISEL-GFX11-WF64-FAKE16-NEXT: S_ENDPGM 0
;
; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat
; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
new file mode 100644
index 0000000..3a55070
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -0,0 +1,305 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s
+
+declare i16 @llvm.amdgcn.sat.pk4.i4.i8(i32) #0
+declare i16 @llvm.amdgcn.sat.pk4.u4.u8(i32) #0
+
+define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_clause 0x1
+; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_clause 0x1
+; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s2
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_clause 0x1
+; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_clause 0x1
+; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
+; SDAG-REAL16: ; %bb.1:
+; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
+; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-REAL16-NEXT: s_branch .LBB1_0
+; SDAG-REAL16-NEXT: .p2align 8
+; SDAG-REAL16-NEXT: ; %bb.2:
+; SDAG-REAL16-NEXT: .LBB1_0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
+; SDAG-FAKE16: ; %bb.1:
+; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
+; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-FAKE16-NEXT: s_branch .LBB1_0
+; SDAG-FAKE16-NEXT: .p2align 8
+; SDAG-FAKE16-NEXT: ; %bb.2:
+; SDAG-FAKE16-NEXT: .LBB1_0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_clause 0x1
+; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_clause 0x1
+; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_clause 0x1
+; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_clause 0x1
+; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s2
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_clause 0x1
+; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_clause 0x1
+; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
+; SDAG-REAL16: ; %bb.1:
+; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
+; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-REAL16-NEXT: s_branch .LBB4_0
+; SDAG-REAL16-NEXT: .p2align 8
+; SDAG-REAL16-NEXT: ; %bb.2:
+; SDAG-REAL16-NEXT: .LBB4_0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
+; SDAG-FAKE16: ; %bb.1:
+; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
+; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-FAKE16-NEXT: s_branch .LBB4_0
+; SDAG-FAKE16-NEXT: .p2align 8
+; SDAG-FAKE16-NEXT: ; %bb.2:
+; SDAG-FAKE16-NEXT: .LBB4_0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_clause 0x1
+; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_clause 0x1
+; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
+; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+;
+; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
+; GISEL-REAL16: ; %bb.0:
+; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
+; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: s_endpgm
+;
+; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
+; GISEL-FAKE16: ; %bb.0:
+; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64
+; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: s_endpgm
+ %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0
+ store i16 %cvt, ptr %out, align 2
+ ret void
+}
+
+attributes #0 = { nounwind memory(none) }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
index 8140866..ed6a02b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s
; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s
@@ -17,6 +19,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -29,6 +43,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -41,6 +71,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vl:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0xc1d1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -54,6 +100,23 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_iv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, 1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_iv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -67,6 +130,23 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_ss:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_ss:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -80,6 +160,23 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_sv:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_sv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -92,6 +189,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vs:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
ret { i32, i32 } %v
}
@@ -102,6 +215,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 false)
ret { i32, i32 } %v
}
@@ -112,6 +237,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_bc(i32 %vdst_old, i32 %src0_old) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 true)
ret { i32, i32 } %v
}
@@ -122,6 +259,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi_bc(i32 %vdst_old, i32 %src0_old
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi_bc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 true)
ret { i32, i32 } %v
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
index 2faf375..465414c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
@@ -1,5 +1,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.prng.b32(i32) #0
@@ -29,4 +31,4 @@ define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 {
attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind } \ No newline at end of file
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll
index 0f37639..52f6dab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll
@@ -16,14 +16,15 @@ define bfloat @v_exp2_bf16(bfloat %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -90,26 +91,25 @@ define bfloat @v_exp2_fabs_bf16(bfloat %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -169,26 +169,25 @@ define bfloat @v_exp2_fneg_fabs_bf16(bfloat %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -249,26 +248,25 @@ define bfloat @v_exp2_fneg_bf16(bfloat %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -328,14 +326,15 @@ define bfloat @v_exp2_bf16_fast(bfloat %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -642,47 +641,47 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15
-; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
-; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v2.l
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
-; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, s0
-; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
-; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v1.l
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v2
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v0.l
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
+; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
+; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16:
@@ -774,46 +773,44 @@ define <2 x bfloat> @v_exp2_fneg_v2bf16(<2 x bfloat> %in) {
; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2
+; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0
; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1
-; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 0d5846a..5634df5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6377,28 +6377,99 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-LABEL: v_log_f32_from_fpext_bf16:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
-; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
-; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_f32_from_fpext_bf16:
+; GFX1100-SDAG-TRUE16: ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_f32_from_fpext_bf16:
+; GFX1100-SDAG-FAKE16: ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_f32_from_fpext_bf16:
+; GFX1100-GISEL-TRUE16: ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-GISEL-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_f32_from_fpext_bf16:
+; GFX1100-GISEL-FAKE16: ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-GISEL-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log_f32_from_fpext_bf16:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 8006876..8d1a231 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6377,28 +6377,99 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-LABEL: v_log10_f32_from_fpext_bf16:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
-; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
-; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
-; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_f32_from_fpext_bf16:
+; GFX1100-SDAG-TRUE16: ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_f32_from_fpext_bf16:
+; GFX1100-SDAG-FAKE16: ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_f32_from_fpext_bf16:
+; GFX1100-GISEL-TRUE16: ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-GISEL-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_f32_from_fpext_bf16:
+; GFX1100-GISEL-FAKE16: ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-GISEL-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log10_f32_from_fpext_bf16:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index c1ac74e..7ca72bf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3830,20 +3830,67 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1100-LABEL: v_log2_f32_from_fpext_bf16:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
-; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_f32_from_fpext_bf16:
+; GFX1100-SDAG-TRUE16: ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_f32_from_fpext_bf16:
+; GFX1100-SDAG-FAKE16: ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_f32_from_fpext_bf16:
+; GFX1100-GISEL-TRUE16: ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0
+; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_f32_from_fpext_bf16:
+; GFX1100-GISEL-FAKE16: ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_log2_f32_from_fpext_bf16:
; R600: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 6e94896..c0fb145 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -193,13 +193,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
-; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; SDAG-GFX9: ; %bb.0:
@@ -265,13 +274,22 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
-; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; SDAG-GFX9: ; %bb.0:
@@ -569,3 +587,4 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign"
attributes #1 = { nounwind readnone speculatable }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GISEL-GFX11-FAKE16: {{.*}}
+; SDAG-GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index 6246f2f..ca16e25 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -118,34 +118,29 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
; GFX11-TRUE16-LABEL: v_maximumnum_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_bf16:
@@ -181,40 +176,34 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_bf16:
@@ -339,21 +328,21 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX11-TRUE16-LABEL: v_maximumnum_bf16_nnan:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan:
@@ -381,25 +370,25 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_bf16_nnan:
@@ -630,58 +619,46 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16:
@@ -738,62 +715,56 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16:
@@ -1012,34 +983,29 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
@@ -1085,36 +1051,35 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_nnan:
@@ -1444,66 +1409,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1575,77 +1541,80 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16:
@@ -1939,41 +1908,40 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -2029,48 +1997,50 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_nnan:
@@ -2507,85 +2477,83 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -2680,99 +2648,98 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16:
@@ -3158,53 +3125,52 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3273,62 +3239,63 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_nnan:
@@ -3957,125 +3924,120 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v10.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v13, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v1, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16:
@@ -4206,142 +4168,141 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v13, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v10.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v7
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v11.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v12.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v1.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v1, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v9.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v0, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v8.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v6bf16:
@@ -5219,171 +5180,160 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v15, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v13, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v13.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v2, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v13, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v15, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v13
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v1, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v0, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v14.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v15.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v8bf16:
@@ -5546,201 +5496,187 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v15, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v13, v18
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v11.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v13, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v13.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v15, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v15
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v2, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v13
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v1, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v12.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v0, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v12.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v11.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v14.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v15.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v8bf16:
@@ -7352,341 +7288,314 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
; GFX11-TRUE16-LABEL: v_maximumnum_v16bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v26, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v27, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v23.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v17.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v16, v25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v25.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v16, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v16, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v16, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v16, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v7.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v7, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v6, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v5, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v4, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v2, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v1, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v17.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v18.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v32.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v32, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v29.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v22, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v16bf16:
@@ -8005,406 +7914,355 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v18, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v24, v25
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v22.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v26, v21
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v21.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v27, v25
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v23.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v19.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v17.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v16, v25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v25.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v26
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v16, v27
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v12.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v22, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v16, v28
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v16, v29
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v16, v30
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v7.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v7, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v6.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v6, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v5, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v4.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v4, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v3, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v2, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v1, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v17.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v18.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v32.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v32, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v29.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v16bf16:
@@ -11681,666 +11539,619 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v86, v118
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s63, v116, v86
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v87, v118
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v96, v119
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s45, v98, v129
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s56, v101, v132
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s59, v112, v135
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s60, v113, v144
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v39.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v50.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v51.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s46, v99, v130
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s61, v114, v145
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s62, v115, v146
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v33.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v37.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v52.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v70.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v81.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v97, v128
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s57, v102, v133
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v48.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v65.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v66.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s47, v100, v131
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s58, v103, v134
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v38.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v49.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v64.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v71.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v80.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v85.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v67.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v84.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v82.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v83.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v69.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v35
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v35.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v36, v37
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0, v37.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v36, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v36, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v50.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v52, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v36, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v36, v49
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v36, v50
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v36, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v36, v52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v36, v53
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v36, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v36, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v36, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v36, v65
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v36, v66
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s14, v36, v67
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v82.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s15, v82, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v38, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v31.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s16, v14, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s17, v13, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s18, v12, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v36.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s19, v11, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v10.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s20, v10, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v9.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s21, v9, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v8, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v7, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v6, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v5, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v4, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v52, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v51, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v51, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v27, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v83, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0, v82.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v39.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v19.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v38.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v82, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v50.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v51.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v54.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v52.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v53.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v19, v36
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v66.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v65.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v67.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v64.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v96, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16:
@@ -12956,753 +12767,697 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: scratch_load_b32 v68, off, s32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27
; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v86, v118
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v32.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s63, v116, v86
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v55.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v87, v118
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v96, v119
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s45, v98, v129
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s56, v101, v132
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s59, v112, v135
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s60, v113, v144
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v34.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v39.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v50.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v51.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s46, v99, v130
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s61, v114, v145
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s62, v115, v146
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v33.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v37.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v52.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v53.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v70.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v81.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v54.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v97, v128
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v35.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s57, v102, v133
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v48.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v65.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v66.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s47, v100, v131
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s58, v103, v134
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v38.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v49.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v64.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v71.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v80.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v85.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v67.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v68.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v84.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v82.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v83.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v69.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0, v96.h
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v50.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v52, v53
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v51.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v38, v53
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v31.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v35
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v35.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v36, v37
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0, v37.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v52, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v36, v38
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v51, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v36, v39
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v28.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v36, v48
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v36, v49
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v51, v50
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v36, v50
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v27, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v36, v51
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v36, v52
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v24
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v36, v53
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v36, v54
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v36, v55
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v36, v64
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v36, v65
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v36, v66
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s14, v36, v67
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v82.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s15, v82, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v14.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s16, v14, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v13.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s17, v13, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s18, v12, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v36.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v11.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s19, v11, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v10.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s20, v10, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v9.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s21, v9, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v8.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v8, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v7, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v6.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v6, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v5.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v12, v37
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v5, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v4, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v83.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v83, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0, v82.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v39.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v19.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v38.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v82, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v50.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v51.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v54.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v52.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v53.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v55.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v19, v36
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v66.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v65.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v67.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v64.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v96, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v32bf16:
@@ -14612,34 +14367,29 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX11-TRUE16-LABEL: v_maximumnum_bf16_no_ieee:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14675,40 +14425,34 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_bf16_no_ieee:
@@ -14949,58 +14693,46 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -15057,62 +14789,56 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee:
@@ -15458,66 +15184,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -15589,77 +15316,80 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee:
@@ -16117,85 +15847,83 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
@@ -16290,99 +16018,98 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index 678d0a4..416a601 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -120,34 +120,29 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
; GFX11-TRUE16-LABEL: v_minimumnum_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_bf16:
@@ -183,40 +178,34 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_bf16:
@@ -344,21 +333,21 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan:
@@ -386,25 +375,25 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan:
@@ -639,58 +628,46 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16:
@@ -747,62 +724,56 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16:
@@ -1024,34 +995,29 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
@@ -1097,36 +1063,35 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_nnan:
@@ -1459,66 +1424,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1590,77 +1556,80 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16:
@@ -1957,41 +1926,40 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -2047,48 +2015,50 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_nnan:
@@ -2528,85 +2498,83 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -2701,99 +2669,98 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16:
@@ -3181,53 +3148,52 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3296,62 +3262,63 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y)
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_nnan:
@@ -3984,125 +3951,120 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v10.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v13, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v1, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v0, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16:
@@ -4233,142 +4195,141 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v13, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v10.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v7
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v11.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v1, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v0, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v8.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v6bf16:
@@ -5250,171 +5211,160 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v15, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v13, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v13.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v2, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v13, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v15, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v13
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v1, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v0, v12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v14.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v15.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v8bf16:
@@ -5577,201 +5527,187 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v15, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v13, v18
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v11.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v13, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v13.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v14
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v15, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v15
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v2, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v13
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v1, v12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v0, v12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v12.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v11.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v14.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v15.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v8bf16:
@@ -7391,341 +7327,314 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
; GFX11-TRUE16-LABEL: v_minimumnum_v16bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v24, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v26, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v27, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v23.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v17.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v16, v25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v25.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v16, v27
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v16, v28
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v16, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v16, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v7.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v7, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v6, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v5, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v4, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v3, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v2, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v1, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v17.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v18.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v32.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v32, v16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v29.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v22, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v16bf16:
@@ -8044,406 +7953,355 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v18, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v24, v25
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v22.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v26, v21
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l
; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v21.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v27, v25
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v23.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v19.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v17.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v18
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v16, v25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v25.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v26
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v16, v27
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v22, v17
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v16, v28
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v16, v29
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v16, v30
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v7.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v7, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v6.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v6, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v5, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v4.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v4, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v3.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v3, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v2.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v2, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v19
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v1.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v1, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v17.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v18.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v32.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v32, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v29.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v16bf16:
@@ -11736,666 +11594,619 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27
; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v86, v118
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s63, v116, v86
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v87, v118
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v96, v119
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s45, v98, v129
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s56, v101, v132
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s59, v112, v135
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s60, v113, v144
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v39.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v50.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v51.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s46, v99, v130
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s61, v114, v145
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s62, v115, v146
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v33.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v37.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v52.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v70.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v81.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v97, v128
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s57, v102, v133
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v48.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v65.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v66.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s47, v100, v131
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s58, v103, v134
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v38.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v49.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v64.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v71.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v80.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v85.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v67.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v84.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v82.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v83.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v69.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0x8000, v96.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v35
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v35.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v36, v37
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0x8000, v37.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v36, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v36, v39
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v15.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v50.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v52, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v36, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v36, v49
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v36, v50
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v36, v51
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v36, v52
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v36, v53
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v36, v54
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v36, v55
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v36, v64
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v36, v65
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v36, v66
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s14, v36, v67
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v82.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s15, v82, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v14.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v38, v53
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v31.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s16, v14, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v13.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s17, v13, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v12.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s18, v12, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v11.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s19, v11, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v10.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s20, v10, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v9.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s21, v9, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v8.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v8, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v7, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v6, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v5, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v4, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v52, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v51, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v51, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v27, v26
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v24
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v83, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0x8000, v82.h
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v39.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v19.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v38.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v82, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v50.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v51.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v54.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v52.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v53.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v55.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v19, v36
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v66.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v65.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v67.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v64.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v96, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16:
@@ -13011,753 +12822,697 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: scratch_load_b32 v31, off, s32
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: scratch_load_b32 v68, off, s32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27
; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v86, v118
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v32.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s63, v116, v86
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v55.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v87, v118
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v96, v119
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s45, v98, v129
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s56, v101, v132
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s59, v112, v135
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s60, v113, v144
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v34.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v39.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v50.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v51.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s46, v99, v130
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s61, v114, v145
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s62, v115, v146
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v33.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v37.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v52.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v53.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v70.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v81.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v54.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v97, v128
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v35.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s57, v102, v133
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v48.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v65.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v66.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s47, v100, v131
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s58, v103, v134
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v38.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v49.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v64.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v71.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v80.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v85.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v67.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v68.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v84.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v82.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v83.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v69.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12
-; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0x8000, v96.h
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v15.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v50.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v52, v53
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v51.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v38, v53
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v31.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v35
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v35.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v36, v37
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0x8000, v37.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v52, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v36, v38
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v51, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v36, v39
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v28.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v36, v48
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v36, v49
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v51, v50
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v36, v50
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v27, v26
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v36, v51
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v36, v52
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v24
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v36, v53
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v36, v54
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v36, v55
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v36, v64
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v36, v65
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v36, v66
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s14, v36, v67
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v82.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s15, v82, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v14.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s16, v14, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v13.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s17, v13, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v12.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s18, v12, v36
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v11.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s19, v11, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v10.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s20, v10, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v9.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s21, v9, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v8.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v8, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v7, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v6, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v5.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v12, v37
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v5, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v4, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v83.h
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v83, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0x8000, v82.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v39.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v19.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v38.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v82, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v50.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v51.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v54.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v52.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v53.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v55.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v19, v36
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v66.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v65.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v67.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v64.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v96, v36
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v32bf16:
@@ -14669,34 +14424,29 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX11-TRUE16-LABEL: v_minimumnum_bf16_no_ieee:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -14732,40 +14482,34 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_bf16_no_ieee:
@@ -15009,58 +14753,46 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -15117,62 +14849,56 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee:
@@ -15521,66 +15247,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -15652,77 +15379,80 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee:
@@ -16183,85 +15913,83 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
@@ -16356,99 +16084,98 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> %
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l
-; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9
-; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h
+; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3
+; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 42bd2ff..9f539bd 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -813,7 +813,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
@@ -824,11 +825,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index 184c807..ddae1b2 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -1718,7 +1718,7 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
@@ -1751,7 +1751,7 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index cbc9f70..aba20e6 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -2311,4 +2311,51 @@ entry:
ret void
}
+define <4 x float> @test_uitofp_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: test_uitofp_v4i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_v4i8_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r5, %r4;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r7, %r6;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7770U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r9, %r8;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3};
+; CHECK-NEXT: ret;
+ %r = uitofp <4 x i8> %a to <4 x float>
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_sitofp_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: test_sitofp_v4i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_v4i8_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0xbbb3U;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: cvt.rn.f32.s16 %r3, %rs1;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT: cvt.rn.f32.s16 %r5, %rs2;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r6;
+; CHECK-NEXT: cvt.rn.f32.s16 %r7, %rs3;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x8880U;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r8;
+; CHECK-NEXT: cvt.rn.f32.s16 %r9, %rs4;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3};
+; CHECK-NEXT: ret;
+ %r = sitofp <4 x i8> %a to <4 x float>
+ ret <4 x float> %r
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
new file mode 100644
index 0000000..404c423
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_80 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define float @uitofp_trunc_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: uitofp_trunc_nuw(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nuw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nuw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nuw i32 %v to i16
+ %f = uitofp i16 %t to float
+ ret float %f
+}
+
+define float @sitofp_trunc_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: sitofp_trunc_nsw(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nsw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nsw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nsw i32 %v to i16
+ %f = sitofp i16 %t to float
+ ret float %f
+}
+
+define float @uitofp_trunc_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: uitofp_trunc_nsw(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nsw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nsw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: cvt.rn.f32.u16 %r4, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nsw i32 %v to i16
+ %f = uitofp i16 %t to float
+ ret float %f
+}
+
+define float @sitofp_trunc_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: sitofp_trunc_nuw(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nuw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nuw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nuw i32 %v to i16
+ %f = sitofp i16 %t to float
+ ret float %f
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 9af92aa..578b67e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -538,3 +538,164 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
%res7 = insertvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res6, <vscale x 8 x i8> %t7, 7
ret { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %res7
}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
+; CHECK-LABEL: masked_load_factor2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl4r.v v12, (a0)
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v12, 0
+; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: ret
+ %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
+ %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %deinterleaved.results
+}
+
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
+; CHECK-LABEL: masked_loat_factor4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 2
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; CHECK-NEXT: vl4r.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
+ %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
+ ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results
+}
+
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: masked_loat_factor4_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: vmv.v.v v9, v8
+; CHECK-NEXT: srli a4, a2, 2
+; CHECK-NEXT: vmv.v.v v10, v8
+; CHECK-NEXT: srli a5, a2, 3
+; CHECK-NEXT: vmv.v.v v11, v8
+; CHECK-NEXT: vsseg4e8.v v8, (a1)
+; CHECK-NEXT: vl1r.v v8, (a1)
+; CHECK-NEXT: add a1, a4, a5
+; CHECK-NEXT: vl1r.v v9, (a3)
+; CHECK-NEXT: add a3, a3, a2
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vl1r.v v10, (a3)
+; CHECK-NEXT: vl1r.v v11, (a2)
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmsne.vi v10, v11, 0
+; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v0, v9, a5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v0, v8, a4
+; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v10, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
+ %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
+ %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
+ ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results
+}
+
+; Negative test - some of the deinterleaved elements might come from the
+; passthru not the load
+define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_passthru(ptr %p, <vscale x 8 x i1> %mask, <vscale x 32 x i8> %passthru) {
+; CHECK-LABEL: masked_loat_factor4_passthru:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: add a3, a1, a2
+; CHECK-NEXT: vmv.v.v v13, v12
+; CHECK-NEXT: srli a4, a2, 2
+; CHECK-NEXT: vmv.v.v v14, v12
+; CHECK-NEXT: srli a5, a2, 3
+; CHECK-NEXT: vmv.v.v v15, v12
+; CHECK-NEXT: vsseg4e8.v v12, (a1)
+; CHECK-NEXT: vl1r.v v12, (a1)
+; CHECK-NEXT: add a1, a4, a5
+; CHECK-NEXT: vl1r.v v13, (a3)
+; CHECK-NEXT: add a3, a3, a2
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vl1r.v v14, (a3)
+; CHECK-NEXT: vl1r.v v15, (a2)
+; CHECK-NEXT: vmsne.vi v13, v13, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: vmsne.vi v12, v14, 0
+; CHECK-NEXT: vmsne.vi v14, v15, 0
+; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v0, v13, a5
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v0, v12, a4
+; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v14, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu
+; CHECK-NEXT: vle8.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
+ %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> %passthru)
+ %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
+ ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results
+}
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index 061435c..59a702a 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -798,12 +798,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32
; RV64SFBSIFIVEU74-LABEL: sextw_removal_ccor:
; RV64SFBSIFIVEU74: # %bb.0: # %bb
; RV64SFBSIFIVEU74-NEXT: addi sp, sp, -32
-; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: mv s0, a3
+; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: andi a0, a0, 1
; RV64SFBSIFIVEU74-NEXT: mv s1, a2
+; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: beqz a0, .LBB15_4
; RV64SFBSIFIVEU74-NEXT: # %bb.3: # %bb
; RV64SFBSIFIVEU74-NEXT: or s0, a3, a1
@@ -824,11 +824,11 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32
; RV64SFBANDESAX45-LABEL: sextw_removal_ccor:
; RV64SFBANDESAX45: # %bb.0: # %bb
; RV64SFBANDESAX45-NEXT: addi sp, sp, -32
-; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: mv s0, a3
+; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: mv s1, a2
+; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: nds.bbc a0, 0, .LBB15_2
; RV64SFBANDESAX45-NEXT: # %bb.1:
; RV64SFBANDESAX45-NEXT: or s0, s0, a1
@@ -848,12 +848,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32
; ZICOND-LABEL: sextw_removal_ccor:
; ZICOND: # %bb.0: # %bb
; ZICOND-NEXT: addi sp, sp, -32
-; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; ZICOND-NEXT: mv s0, a3
+; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; ZICOND-NEXT: andi a0, a0, 1
; ZICOND-NEXT: mv s1, a2
+; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; ZICOND-NEXT: beqz a0, .LBB15_4
; ZICOND-NEXT: # %bb.3: # %bb
; ZICOND-NEXT: or s0, a3, a1
@@ -874,12 +874,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32
; RV32SFB-LABEL: sextw_removal_ccor:
; RV32SFB: # %bb.0: # %bb
; RV32SFB-NEXT: addi sp, sp, -16
-; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
-; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: mv s0, a3
+; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: andi a0, a0, 1
; RV32SFB-NEXT: mv s1, a2
+; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: beqz a0, .LBB15_4
; RV32SFB-NEXT: # %bb.3: # %bb
; RV32SFB-NEXT: or s0, a3, a1
@@ -941,11 +941,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
; RV64SFBSIFIVEU74-LABEL: sextw_removal_ccaddw:
; RV64SFBSIFIVEU74: # %bb.0: # %bb
; RV64SFBSIFIVEU74-NEXT: addi sp, sp, -32
-; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: mv s1, a1
; RV64SFBSIFIVEU74-NEXT: andi a0, a0, 1
+; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBSIFIVEU74-NEXT: mv s0, a2
; RV64SFBSIFIVEU74-NEXT: beqz a0, .LBB16_4
; RV64SFBSIFIVEU74-NEXT: # %bb.3: # %bb
@@ -967,11 +967,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
; RV64SFBANDESAX45-LABEL: sextw_removal_ccaddw:
; RV64SFBANDESAX45: # %bb.0: # %bb
; RV64SFBANDESAX45-NEXT: addi sp, sp, -32
-; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: mv s0, a2
+; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: mv s1, a1
+; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; RV64SFBANDESAX45-NEXT: nds.bbc a0, 0, .LBB16_2
; RV64SFBANDESAX45-NEXT: # %bb.1:
; RV64SFBANDESAX45-NEXT: addw s1, s1, a3
@@ -991,11 +991,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
; ZICOND-LABEL: sextw_removal_ccaddw:
; ZICOND: # %bb.0: # %bb
; ZICOND-NEXT: addi sp, sp, -32
-; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
; ZICOND-NEXT: mv s1, a1
; ZICOND-NEXT: andi a0, a0, 1
+; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; ZICOND-NEXT: mv s0, a2
; ZICOND-NEXT: beqz a0, .LBB16_4
; ZICOND-NEXT: # %bb.3: # %bb
@@ -1017,11 +1017,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
; RV32SFB-LABEL: sextw_removal_ccaddw:
; RV32SFB: # %bb.0: # %bb
; RV32SFB-NEXT: addi sp, sp, -16
-; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: mv s1, a1
; RV32SFB-NEXT: andi a0, a0, 1
+; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32SFB-NEXT: mv s0, a2
; RV32SFB-NEXT: beqz a0, .LBB16_4
; RV32SFB-NEXT: # %bb.3: # %bb
diff --git a/llvm/test/CodeGen/RISCV/zdinx-spill.ll b/llvm/test/CodeGen/RISCV/zdinx-spill.ll
index d7a7006..6f206fe 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-spill.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-spill.ll
@@ -9,7 +9,6 @@ define double @foo(double %x) nounwind {
; CHECK-NEXT: liveins: $x10, $x11, $x8, $x9, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -64
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 64
; CHECK-NEXT: frame-setup SW killed $x8, $x2, 60 :: (store (s32) into %stack.1)
; CHECK-NEXT: frame-setup SW killed $x9, $x2, 56 :: (store (s32) into %stack.2)
; CHECK-NEXT: frame-setup SW killed $x18, $x2, 52 :: (store (s32) into %stack.3)
@@ -22,18 +21,6 @@ define double @foo(double %x) nounwind {
; CHECK-NEXT: frame-setup SW killed $x25, $x2, 24 :: (store (s32) into %stack.10)
; CHECK-NEXT: frame-setup SW killed $x26, $x2, 20 :: (store (s32) into %stack.11)
; CHECK-NEXT: frame-setup SW killed $x27, $x2, 16 :: (store (s32) into %stack.12)
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x9, -8
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x18, -12
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x19, -16
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x20, -20
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x21, -24
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x22, -28
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x23, -32
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x24, -36
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x25, -40
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x26, -44
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x27, -48
; CHECK-NEXT: renamable $x10_x11 = nofpexcept FADD_D_IN32X killed renamable $x10_x11, renamable $x10_x11, 7, implicit $frm
; CHECK-NEXT: PseudoRV32ZdinxSD killed renamable $x10_x11, $x2, 8 :: (store (s64) into %stack.0, align 4)
; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $x6, 12 /* clobber */, implicit-def dead early-clobber $x7, 12 /* clobber */, implicit-def dead early-clobber $x8, 12 /* clobber */, implicit-def dead early-clobber $x9, 12 /* clobber */, implicit-def dead early-clobber $x10, 12 /* clobber */, implicit-def dead early-clobber $x11, 12 /* clobber */, implicit-def dead early-clobber $x12, 12 /* clobber */, implicit-def dead early-clobber $x13, 12 /* clobber */, implicit-def dead early-clobber $x14, 12 /* clobber */, implicit-def dead early-clobber $x15, 12 /* clobber */, implicit-def dead early-clobber $x16, 12 /* clobber */, implicit-def dead early-clobber $x17, 12 /* clobber */, implicit-def dead early-clobber $x18, 12 /* clobber */, implicit-def dead early-clobber $x19, 12 /* clobber */, implicit-def dead early-clobber $x20, 12 /* clobber */, implicit-def dead early-clobber $x21, 12 /* clobber */, implicit-def dead early-clobber $x22, 12 /* clobber */, implicit-def dead early-clobber $x23, 12 /* clobber */, implicit-def dead early-clobber $x24, 12 /* clobber */, implicit-def dead early-clobber $x25, 12 /* clobber */, implicit-def dead early-clobber $x26, 12 /* clobber */, implicit-def dead early-clobber $x27, 12 /* clobber */, implicit-def dead early-clobber $x28, 12 /* clobber */, implicit-def dead early-clobber $x29, 12 /* clobber */, implicit-def dead early-clobber $x31
@@ -50,20 +37,7 @@ define double @foo(double %x) nounwind {
; CHECK-NEXT: $x25 = frame-destroy LW $x2, 24 :: (load (s32) from %stack.10)
; CHECK-NEXT: $x26 = frame-destroy LW $x2, 20 :: (load (s32) from %stack.11)
; CHECK-NEXT: $x27 = frame-destroy LW $x2, 16 :: (load (s32) from %stack.12)
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x8
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x9
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x18
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x19
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x20
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x21
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x22
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x23
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x24
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x25
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x26
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x27
; CHECK-NEXT: $x2 = frame-destroy ADDI $x2, 64
- ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
%a = fadd double %x, %x
call void asm sideeffect "", "~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{xr0},~{x31}"()
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 5dcf190..834dfd6 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -8,7 +8,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
@@ -492,11 +493,20 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v8i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE2-NEXT: psrlw $8, %xmm1
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm1, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX: # %bb.0:
@@ -605,11 +615,25 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v4i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v4i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm1, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm2, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v4i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX: # %bb.0:
@@ -720,11 +744,27 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v2i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v2i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm1, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm2, %xmm0
+; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v2i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX: # %bb.0:
@@ -1042,15 +1082,30 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v16i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v16i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE2-NEXT: psrlw $8, %xmm2
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm2, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE2-NEXT: psrlw $8, %xmm3
+; GFNISSE2-NEXT: psllw $8, %xmm1
+; GFNISSE2-NEXT: por %xmm3, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v16i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i16:
; GFNIAVX1: # %bb.0:
@@ -1241,15 +1296,39 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v8i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm2, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm3
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm4, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i32:
; GFNIAVX1: # %bb.0:
@@ -1444,15 +1523,43 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v4i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: pshufb %xmm2, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
-; GFNISSE-NEXT: pshufb %xmm2, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v4i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm2, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm3
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm4, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v4i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v4i64:
; GFNIAVX1: # %bb.0:
@@ -2035,19 +2142,44 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v32i16:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v32i16:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE2-NEXT: psrlw $8, %xmm4
+; GFNISSE2-NEXT: psllw $8, %xmm0
+; GFNISSE2-NEXT: por %xmm4, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm1
+; GFNISSE2-NEXT: por %xmm5, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm2
+; GFNISSE2-NEXT: por %xmm5, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE2-NEXT: psrlw $8, %xmm5
+; GFNISSE2-NEXT: psllw $8, %xmm3
+; GFNISSE2-NEXT: por %xmm5, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v32i16:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v32i16:
; GFNIAVX1: # %bb.0:
@@ -2393,19 +2525,61 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v16i32:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v16i32:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm4, %xmm4
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm5
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v16i32:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i32:
; GFNIAVX1: # %bb.0:
@@ -2759,19 +2933,69 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
;
-; GFNISSE-LABEL: test_bitreverse_v8i64:
-; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; GFNISSE-NEXT: pshufb %xmm4, %xmm0
-; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
-; GFNISSE-NEXT: pshufb %xmm4, %xmm1
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
-; GFNISSE-NEXT: pshufb %xmm4, %xmm2
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
-; GFNISSE-NEXT: pshufb %xmm4, %xmm3
-; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
-; GFNISSE-NEXT: retq
+; GFNISSE2-LABEL: test_bitreverse_v8i64:
+; GFNISSE2: # %bb.0:
+; GFNISSE2-NEXT: pxor %xmm4, %xmm4
+; GFNISSE2-NEXT: movdqa %xmm0, %xmm5
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSE2-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm1
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSE2-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm2
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSE2-NEXT: movdqa %xmm3, %xmm6
+; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; GFNISSE2-NEXT: packuswb %xmm6, %xmm3
+; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSE2-NEXT: retq
+;
+; GFNISSSE3-LABEL: test_bitreverse_v8i64:
+; GFNISSSE3: # %bb.0:
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0
+; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
+; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3
+; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
+; GFNISSSE3-NEXT: retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i64:
; GFNIAVX1: # %bb.0:
diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
index 1d073cd..a878dbe 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; Test handling of llvm.lifetime intrinsics.
; RUN: opt < %s -passes=asan -asan-use-after-scope -asan-use-after-return=never -asan-use-stack-safety=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK-DEFAULT
; RUN: opt < %s -passes=asan -asan-use-after-scope -asan-use-after-return=never -asan-use-stack-safety=0 -asan-instrument-dynamic-allocas=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK-NO-DYNAMIC
@@ -8,109 +9,389 @@ target triple = "x86_64-unknown-linux-gnu"
declare void @llvm.lifetime.start.p0(i64, ptr nocapture) nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind
-; CHECK-LABEL: define void @lifetime_no_size(
define void @lifetime_no_size(i64 %i) sanitize_address {
+; CHECK-LABEL: define void @lifetime_no_size(
+; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @lifetime_no_size to i64), ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; CHECK-NEXT: store i64 -868083117767659023, ptr [[TMP11]], align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP2]])
+; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP2]], i64 0, i64 [[I]]
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[AI]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP12]], 3
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 2147450880
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP16]], 0
+; CHECK-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[BB23:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB18]]:
+; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP12]], 7
+; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i8
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sge i8 [[TMP20]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[BB22:.*]], label %[[BB23]]
+; CHECK: [[BB22]]:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP12]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB23]]:
+; CHECK-NEXT: store volatile i8 0, ptr [[AI]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[TMP2]])
+; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+; CHECK-NEXT: store i64 0, ptr [[TMP25]], align 1
+; CHECK-NEXT: ret void
+;
entry:
%a = alloca [2 x i32], align 4
; Poison memory in prologue: 0xf3f3f300f1f1f1f1
- ; CHECK: store i64 -868083117767659023, ptr %[[#]]
call void @llvm.lifetime.start.p0(i64 -1, ptr %a)
; Check that lifetime with no size are ignored.
- ; CHECK-NOT: store
- ; CHECK: call void @llvm.lifetime.start
%ai = getelementptr inbounds [2 x i32], ptr %a, i64 0, i64 %i
store volatile i8 0, ptr %ai, align 4
- ; CHECK: store volatile
call void @llvm.lifetime.end.p0(i64 -1, ptr %a)
; Check that lifetime with no size are ignored.
- ; CHECK-NOT: store
- ; CHECK: call void @llvm.lifetime.end
; Unpoison stack frame on exit.
- ; CHECK: store i64 0, ptr %{{[0-9]+}}
- ; CHECK: ret void
ret void
}
; Generic case of lifetime analysis.
define void @lifetime() sanitize_address {
- ; CHECK-LABEL: define void @lifetime()
+; CHECK-DEFAULT-LABEL: define void @lifetime(
+; CHECK-DEFAULT-SAME: ) #[[ATTR1]] {
+; CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = alloca i64, align 32
+; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP1]], align 8
+; CHECK-DEFAULT-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
+; CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 32
+; CHECK-DEFAULT-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-DEFAULT-NEXT: store i64 1102416563, ptr [[TMP5]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], 8
+; CHECK-DEFAULT-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP7]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 16
+; CHECK-DEFAULT-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP9]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 3
+; CHECK-DEFAULT-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880
+; CHECK-DEFAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
+; CHECK-DEFAULT-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-DEFAULT-NEXT: store i64 -868082052615769615, ptr [[TMP13]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4
+; CHECK-DEFAULT-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr
+; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP15]], align 1
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP4]])
+; CHECK-DEFAULT-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP16]], 3
+; CHECK-DEFAULT-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 2147450880
+; CHECK-DEFAULT-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0
+; CHECK-DEFAULT-NEXT: br i1 [[TMP21]], label %[[BB22:.*]], label %[[BB27:.*]], !prof [[PROF1]]
+; CHECK-DEFAULT: [[BB22]]:
+; CHECK-DEFAULT-NEXT: [[TMP23:%.*]] = and i64 [[TMP16]], 7
+; CHECK-DEFAULT-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i8
+; CHECK-DEFAULT-NEXT: [[TMP25:%.*]] = icmp sge i8 [[TMP24]], [[TMP20]]
+; CHECK-DEFAULT-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[BB27]]
+; CHECK-DEFAULT: [[BB26]]:
+; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP16]]) #[[ATTR4]]
+; CHECK-DEFAULT-NEXT: unreachable
+; CHECK-DEFAULT: [[BB27]]:
+; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[TMP11]], 4
+; CHECK-DEFAULT-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP29]], align 1
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]])
+; CHECK-DEFAULT-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4
+; CHECK-DEFAULT-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr
+; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP31]], align 1
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP4]])
+; CHECK-DEFAULT-NEXT: [[TMP32:%.*]] = alloca i8, i64 128, align 32
+; CHECK-DEFAULT-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 32
+; CHECK-DEFAULT-NEXT: call void @__asan_alloca_poison(i64 [[TMP34]], i64 40)
+; CHECK-DEFAULT-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP32]] to i64
+; CHECK-DEFAULT-NEXT: store i64 [[TMP35]], ptr [[TMP1]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP34]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64
+; CHECK-DEFAULT-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP37]], i64 40)
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[TMP36]])
+; CHECK-DEFAULT-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP39:%.*]] = lshr i64 [[TMP38]], 3
+; CHECK-DEFAULT-NEXT: [[TMP40:%.*]] = add i64 [[TMP39]], 2147450880
+; CHECK-DEFAULT-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP41]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP43:%.*]] = icmp ne i8 [[TMP42]], 0
+; CHECK-DEFAULT-NEXT: br i1 [[TMP43]], label %[[BB44:.*]], label %[[BB49:.*]], !prof [[PROF1]]
+; CHECK-DEFAULT: [[BB44]]:
+; CHECK-DEFAULT-NEXT: [[TMP45:%.*]] = and i64 [[TMP38]], 7
+; CHECK-DEFAULT-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i8
+; CHECK-DEFAULT-NEXT: [[TMP47:%.*]] = icmp sge i8 [[TMP46]], [[TMP42]]
+; CHECK-DEFAULT-NEXT: br i1 [[TMP47]], label %[[BB48:.*]], label %[[BB49]]
+; CHECK-DEFAULT: [[BB48]]:
+; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP38]]) #[[ATTR4]]
+; CHECK-DEFAULT-NEXT: unreachable
+; CHECK-DEFAULT: [[BB49]]:
+; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP36]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[TMP36]] to i64
+; CHECK-DEFAULT-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP50]], i64 40)
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[TMP36]])
+; CHECK-DEFAULT-NEXT: [[TMP51:%.*]] = add i64 [[TMP11]], 4
+; CHECK-DEFAULT-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr
+; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP52]], align 1
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP4]])
+; CHECK-DEFAULT-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP54:%.*]] = lshr i64 [[TMP53]], 3
+; CHECK-DEFAULT-NEXT: [[TMP55:%.*]] = add i64 [[TMP54]], 2147450880
+; CHECK-DEFAULT-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
+; CHECK-DEFAULT-NEXT: [[TMP57:%.*]] = load i8, ptr [[TMP56]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP58:%.*]] = icmp ne i8 [[TMP57]], 0
+; CHECK-DEFAULT-NEXT: br i1 [[TMP58]], label %[[BB59:.*]], label %[[BB64:.*]], !prof [[PROF1]]
+; CHECK-DEFAULT: [[BB59]]:
+; CHECK-DEFAULT-NEXT: [[TMP60:%.*]] = and i64 [[TMP53]], 7
+; CHECK-DEFAULT-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i8
+; CHECK-DEFAULT-NEXT: [[TMP62:%.*]] = icmp sge i8 [[TMP61]], [[TMP57]]
+; CHECK-DEFAULT-NEXT: br i1 [[TMP62]], label %[[BB63:.*]], label %[[BB64]]
+; CHECK-DEFAULT: [[BB63]]:
+; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP53]]) #[[ATTR4]]
+; CHECK-DEFAULT-NEXT: unreachable
+; CHECK-DEFAULT: [[BB64]]:
+; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1
+; CHECK-DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[TMP11]], 4
+; CHECK-DEFAULT-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr
+; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP66]], align 1
+; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]])
+; CHECK-DEFAULT-NEXT: [[TMP67:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-DEFAULT-NEXT: [[TMP68:%.*]] = load i64, ptr [[TMP1]], align 8
+; CHECK-DEFAULT-NEXT: call void @__asan_allocas_unpoison(i64 [[TMP68]], i64 [[TMP67]])
+; CHECK-DEFAULT-NEXT: store i64 1172321806, ptr [[TMP5]], align 8
+; CHECK-DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[TMP11]], 0
+; CHECK-DEFAULT-NEXT: [[TMP70:%.*]] = inttoptr i64 [[TMP69]] to ptr
+; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP70]], align 1
+; CHECK-DEFAULT-NEXT: ret void
+;
+; CHECK-NO-DYNAMIC-LABEL: define void @lifetime(
+; CHECK-NO-DYNAMIC-SAME: ) #[[ATTR1]] {
+; CHECK-NO-DYNAMIC-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
+; CHECK-NO-DYNAMIC-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
+; CHECK-NO-DYNAMIC-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 32
+; CHECK-NO-DYNAMIC-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i64 1102416563, ptr [[TMP4]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP6]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 16
+; CHECK-NO-DYNAMIC-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP8]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP1]], 3
+; CHECK-NO-DYNAMIC-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 2147450880
+; CHECK-NO-DYNAMIC-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NO-DYNAMIC-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i64 -868082052615769615, ptr [[TMP12]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP13:%.*]] = add i64 [[TMP10]], 4
+; CHECK-NO-DYNAMIC-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP14]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP3]])
+; CHECK-NO-DYNAMIC-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NO-DYNAMIC-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP15]], 3
+; CHECK-NO-DYNAMIC-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NO-DYNAMIC-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP20]], label %[[BB21:.*]], label %[[BB26:.*]], !prof [[PROF1]]
+; CHECK-NO-DYNAMIC: [[BB21]]:
+; CHECK-NO-DYNAMIC-NEXT: [[TMP22:%.*]] = and i64 [[TMP15]], 7
+; CHECK-NO-DYNAMIC-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP24:%.*]] = icmp sge i8 [[TMP23]], [[TMP19]]
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP24]], label %[[BB25:.*]], label %[[BB26]]
+; CHECK-NO-DYNAMIC: [[BB25]]:
+; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP15]]) #[[ATTR4]]
+; CHECK-NO-DYNAMIC-NEXT: unreachable
+; CHECK-NO-DYNAMIC: [[BB26]]:
+; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP27:%.*]] = add i64 [[TMP10]], 4
+; CHECK-NO-DYNAMIC-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP28]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
+; CHECK-NO-DYNAMIC-NEXT: [[TMP29:%.*]] = add i64 [[TMP10]], 4
+; CHECK-NO-DYNAMIC-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP30]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP3]])
+; CHECK-NO-DYNAMIC-NEXT: [[ARR:%.*]] = alloca [10 x i32], align 16
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[ARR]])
+; CHECK-NO-DYNAMIC-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[ARR]] to i64
+; CHECK-NO-DYNAMIC-NEXT: [[TMP32:%.*]] = lshr i64 [[TMP31]], 3
+; CHECK-NO-DYNAMIC-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 2147450880
+; CHECK-NO-DYNAMIC-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP36:%.*]] = icmp ne i8 [[TMP35]], 0
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP36]], label %[[BB37:.*]], label %[[BB42:.*]], !prof [[PROF1]]
+; CHECK-NO-DYNAMIC: [[BB37]]:
+; CHECK-NO-DYNAMIC-NEXT: [[TMP38:%.*]] = and i64 [[TMP31]], 7
+; CHECK-NO-DYNAMIC-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP35]]
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42]]
+; CHECK-NO-DYNAMIC: [[BB41]]:
+; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP31]]) #[[ATTR4]]
+; CHECK-NO-DYNAMIC-NEXT: unreachable
+; CHECK-NO-DYNAMIC: [[BB42]]:
+; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[ARR]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[ARR]])
+; CHECK-NO-DYNAMIC-NEXT: [[TMP43:%.*]] = add i64 [[TMP10]], 4
+; CHECK-NO-DYNAMIC-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP44]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP3]])
+; CHECK-NO-DYNAMIC-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[TMP3]] to i64
+; CHECK-NO-DYNAMIC-NEXT: [[TMP46:%.*]] = lshr i64 [[TMP45]], 3
+; CHECK-NO-DYNAMIC-NEXT: [[TMP47:%.*]] = add i64 [[TMP46]], 2147450880
+; CHECK-NO-DYNAMIC-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP50:%.*]] = icmp ne i8 [[TMP49]], 0
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP50]], label %[[BB51:.*]], label %[[BB56:.*]], !prof [[PROF1]]
+; CHECK-NO-DYNAMIC: [[BB51]]:
+; CHECK-NO-DYNAMIC-NEXT: [[TMP52:%.*]] = and i64 [[TMP45]], 7
+; CHECK-NO-DYNAMIC-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP54:%.*]] = icmp sge i8 [[TMP53]], [[TMP49]]
+; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP54]], label %[[BB55:.*]], label %[[BB56]]
+; CHECK-NO-DYNAMIC: [[BB55]]:
+; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP45]]) #[[ATTR4]]
+; CHECK-NO-DYNAMIC-NEXT: unreachable
+; CHECK-NO-DYNAMIC: [[BB56]]:
+; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1
+; CHECK-NO-DYNAMIC-NEXT: [[TMP57:%.*]] = add i64 [[TMP10]], 4
+; CHECK-NO-DYNAMIC-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP58]], align 1
+; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
+; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[TMP4]], align 8
+; CHECK-NO-DYNAMIC-NEXT: [[TMP59:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NO-DYNAMIC-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr
+; CHECK-NO-DYNAMIC-NEXT: store i64 0, ptr [[TMP60]], align 1
+; CHECK-NO-DYNAMIC-NEXT: ret void
+;
; Regular variable lifetime intrinsics.
%i = alloca i32, align 4
; Poison memory in prologue: F1F1F1F1F8F3F3F3
- ; CHECK: store i64 -868082052615769615, ptr %{{[0-9]+}}
; Memory is unpoisoned at llvm.lifetime.start
call void @llvm.lifetime.start.p0(i64 3, ptr %i)
- ; CHECK: store i8 4, ptr %{{[0-9]+}}
- ; CHECK-NEXT: llvm.lifetime.start
store volatile i8 0, ptr %i
- ; CHECK: store volatile
call void @llvm.lifetime.end.p0(i64 4, ptr %i)
- ; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: call void @llvm.lifetime.end
; Memory is poisoned at every call to llvm.lifetime.end
call void @llvm.lifetime.end.p0(i64 2, ptr %i)
- ; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: call void @llvm.lifetime.end
; Lifetime intrinsics for array.
%arr = alloca [10 x i32], align 16
call void @llvm.lifetime.start.p0(i64 40, ptr %arr)
- ; CHECK-DEFAULT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40)
- ; CHECK-NO-DYNAMIC-NOT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40)
store volatile i8 0, ptr %arr
- ; CHECK: store volatile
call void @llvm.lifetime.end.p0(i64 40, ptr %arr)
- ; CHECK-DEFAULT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40)
- ; CHECK-NO-DYNAMIC-NOT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40)
; One more lifetime start/end for the same variable %i.
call void @llvm.lifetime.start.p0(i64 2, ptr %i)
- ; CHECK: store i8 4, ptr %{{[0-9]+}}
- ; CHECK-NEXT: llvm.lifetime.start
store volatile i8 0, ptr %i
- ; CHECK: store volatile
call void @llvm.lifetime.end.p0(i64 4, ptr %i)
- ; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: llvm.lifetime.end
; Memory is unpoisoned at function exit (only once).
- ; CHECK: store i64 0, ptr %{{[0-9]+}}
- ; CHECK-NEXT: ret void
ret void
}
; Check that arguments of lifetime may come from phi nodes.
define void @phi_args(i1 %x) sanitize_address {
- ; CHECK-LABEL: define void @phi_args(i1 %x)
+; CHECK-LABEL: define void @phi_args(
+; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.2 to i64), ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @phi_args to i64), ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
+; CHECK-NEXT: store i64 -868082052615769615, ptr [[TMP11]], align 1
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[TMP2]])
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP2]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 3
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 2147450880
+; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i8 [[TMP18]], 0
+; CHECK-NEXT: br i1 [[TMP19]], label %[[BB20:.*]], label %[[BB25:.*]], !prof [[PROF1]]
+; CHECK: [[BB20]]:
+; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP14]], 7
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i8
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP18]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[BB24:.*]], label %[[BB25]]
+; CHECK: [[BB24]]:
+; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP14]]) #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB25]]:
+; CHECK-NEXT: store volatile i8 0, ptr [[TMP2]], align 1
+; CHECK-NEXT: br i1 [[X]], label %[[BB0:.*]], label %[[BB1:.*]]
+; CHECK: [[BB0]]:
+; CHECK-NEXT: br label %[[BB1]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[I_PHI:%.*]] = phi ptr [ [[TMP2]], %[[BB25]] ], [ [[TMP2]], %[[BB0]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP9]], 4
+; CHECK-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr
+; CHECK-NEXT: store i8 -8, ptr [[TMP27]], align 1
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[I_PHI]])
+; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP9]], 0
+; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
+; CHECK-NEXT: store i64 0, ptr [[TMP29]], align 1
+; CHECK-NEXT: ret void
+;
entry:
%i = alloca i64, align 4
; Poison memory in prologue: F1F1F1F1F8F3F3F3
- ; CHECK: store i64 -868082052615769615, ptr %{{[0-9]+}}
call void @llvm.lifetime.start.p0(i64 8, ptr %i)
- ; CHECK: store i8 0, ptr %{{[0-9]+}}
- ; CHECK-NEXT: llvm.lifetime.start
store volatile i8 0, ptr %i
- ; CHECK: store volatile
br i1 %x, label %bb0, label %bb1
@@ -120,49 +401,101 @@ bb0:
bb1:
%i.phi = phi ptr [ %i, %entry ], [ %i, %bb0 ]
call void @llvm.lifetime.end.p0(i64 8, ptr %i.phi)
- ; CHECK: store i8 -8, ptr %{{[0-9]+}}
- ; CHECK-NEXT: llvm.lifetime.end
ret void
- ; CHECK: store i64 0, ptr %{{[0-9]+}}
- ; CHECK-NEXT: ret void
}
; Check that arguments of lifetime may come from getelementptr nodes.
define void @getelementptr_args(i64 %i) sanitize_address{
- ; CHECK-LABEL: define void @getelementptr_args
+; CHECK-LABEL: define void @getelementptr_args(
+; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 1216, align 32
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1184
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP0]] to ptr
+; CHECK-NEXT: store i64 1102416563, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.3 to i64), ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: store i64 ptrtoint (ptr @getelementptr_args to i64), ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP0]], 3
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: store i32 -235802127, ptr [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4
+; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP14]], i64 128)
+; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], 132
+; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
+; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP16]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP11]], 140
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 150
+; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
+; CHECK-NEXT: store i16 -3085, ptr [[TMP20]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP11]], 4
+; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP21]], i64 128)
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1024, ptr [[TMP2]])
+; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 [[I]]
+; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[AI]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 3
+; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 2147450880
+; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr
+; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i8 [[TMP26]], 0
+; CHECK-NEXT: br i1 [[TMP27]], label %[[BB28:.*]], label %[[BB29:.*]]
+; CHECK: [[BB28]]:
+; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP22]]) #[[ATTR4]]
+; CHECK-NEXT: unreachable
+; CHECK: [[BB29]]:
+; CHECK-NEXT: store ptr [[TMP2]], ptr [[AI]], align 8
+; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4
+; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP30]], i64 128)
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr [[TMP2]])
+; CHECK-NEXT: store i64 1172321806, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP11]], 0
+; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP31]], i64 148)
+; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP11]], 150
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i16 0, ptr [[TMP33]], align 1
+; CHECK-NEXT: ret void
+;
entry:
%x = alloca [1024 x i8], align 16
%a = alloca [2 x ptr], align 8
; F1F1F1F1
- ; CHECK: store i32 -235802127, ptr %{{[0-9]+}}
- ; CHECK: call void @__asan_set_shadow_f8(i64 %[[#]], i64 128)
; 0xf2f2f2f2f2f2f2f2
- ; CHECK: store i64 -940422246894996750, ptr %[[#]]
; 0xf2f2f2f2f2f2f2f2
- ; CHECK: store i64 -940422246894996750, ptr %[[#]]
call void @llvm.lifetime.start.p0(i64 1024, ptr %x)
- ; CHECK: call void @__asan_set_shadow_00(i64 %{{[0-9]+}}, i64 128)
- ; CHECK-NEXT: call void @llvm.lifetime.start
%ai = getelementptr inbounds [2 x ptr], ptr %a, i64 0, i64 %i
store ptr %x, ptr %ai, align 8
- ; CHECK: store ptr
call void @llvm.lifetime.end.p0(i64 1024, ptr %x)
- ; CHECK: call void @__asan_set_shadow_f8(i64 %{{[0-9]+}}, i64 128)
- ; CHECK-NEXT: call void @llvm.lifetime.end
ret void
- ; CHECK: call void @__asan_set_shadow_00(i64 %{{[0-9]+}}, i64 148)
- ; CHECK: store i16 0, ptr %[[#]], align 1
- ; CHECK-NEXT: ret void
}
define void @zero_sized(i64 %a) #0 {
-; CHECK-LABEL: define void @zero_sized(i64 %a)
+; CHECK-LABEL: define void @zero_sized(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+; CHECK-NEXT: [[B:%.*]] = alloca [0 x i8], align 1
+; CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr [[B]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 0, ptr [[B]])
+; CHECK-NEXT: ret void
+;
entry:
%a.addr = alloca i64, align 8
@@ -170,11 +503,13 @@ entry:
store i64 %a, ptr %a.addr, align 8
call void @llvm.lifetime.start.p0(i64 0, ptr %b) #2
- ; CHECK: call void @llvm.lifetime.start
call void @llvm.lifetime.end.p0(i64 0, ptr %b) #2
- ; CHECK: call void @llvm.lifetime.end
ret void
- ; CHECK-NEXT: ret void
}
+;.
+; CHECK-DEFAULT: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
+; CHECK-NO-DYNAMIC: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
index 279bb26..811c6eb 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
@@ -163,6 +163,51 @@ v_tanh_bf16 v5, src_scc
v_tanh_bf16 v127, 0x8000
// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_prng_b32 v5, v1
+// GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e]
+
+v_prng_b32 v5, v255
+// GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e]
+
+v_prng_b32 v5, s1
+// GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, s105
+// GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, vcc_lo
+// GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, vcc_hi
+// GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, ttmp15
+// GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, m0
+// GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, exec_lo
+// GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, exec_hi
+// GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, null
+// GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, -1
+// GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, 0.5
+// GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, src_scc
+// GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e]
+
+v_prng_b32 v255, 0xaf123456
+// GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
v_rcp_bf16 v5, v1
// GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e]
@@ -582,3 +627,60 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], s3
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[4:5], s5
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], s5 ; encoding: [0x05,0xde,0x08,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], 3
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[4:5], 3
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], 3 ; encoding: [0x83,0xde,0x08,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[2:3], v3
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e]
+
+v_cvt_pk_f32_bf8_e32 v[4:5], v3
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v3 ; encoding: [0x03,0xdf,0x08,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], s3
+// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], 3
+// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e]
+
+v_cvt_pk_f32_fp8_e32 v[2:3], v3
+// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e]
+
+v_sat_pk4_i4_i8 v1, v2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, s2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, 2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, 0x1234
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_u4_u8 v1, v2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, s2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, 2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, 0x1234
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index 76272d2..3ddbc36 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -169,6 +169,51 @@ v_tanh_bf16 v127, 0x8000
v_tanh_bf16 v5.h, v1.h
// GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]
+v_prng_b32 v5, v1
+// GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e]
+
+v_prng_b32 v5, v255
+// GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e]
+
+v_prng_b32 v5, s1
+// GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, s105
+// GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, vcc_lo
+// GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, vcc_hi
+// GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, ttmp15
+// GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, m0
+// GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, exec_lo
+// GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, exec_hi
+// GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, null
+// GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, -1
+// GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, 0.5
+// GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e]
+
+v_prng_b32 v5, src_scc
+// GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e]
+
+v_prng_b32 v255, 0xaf123456
+// GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
v_rcp_bf16 v5, v1
// GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e]
@@ -618,3 +663,39 @@ v_cvt_f32_fp8_e32 v1, 3
v_cvt_f32_fp8_e32 v1, v3
// GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, v2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, s2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, 2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e]
+
+v_sat_pk4_i4_i8 v1, 0x1234
+// GFX1250: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_i4_i8 v1.h, v2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1.h, v2 ; encoding: [0x02,0xe7,0x02,0x7f]
+
+v_sat_pk4_u4_u8 v1, v2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, s2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, 2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e]
+
+v_sat_pk4_u4_u8 v1, 0x1234
+// GFX1250: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_u4_u8 v1.h, v2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f]
+
+v_permlane16_swap_b32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
+
+v_permlane16_swap_b32_e32 v1, v2
+// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
index 0a8ee84..7386df8 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
@@ -170,6 +170,58 @@ v_tanh_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 f
// GFX1250: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_mirror
+// GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_half_mirror
+// GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shl:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shl:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shr:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shr:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_ror:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_ror:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -641,3 +693,19 @@ v_cvt_pk_f16_bf8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
v_cvt_pk_f16_fp8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
index d4afb9d..0a46f2f 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
@@ -178,6 +178,58 @@ v_tanh_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
// GFX1250: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_mirror
+// GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_half_mirror
+// GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shl:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shl:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shr:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_shr:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_ror:1
+// GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_ror:15
+// GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -697,3 +749,27 @@ v_cvt_pk_f16_fp8 v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
v_cvt_pk_f16_fp8 v1, v2.h quad_perm:[0,1,2,3]
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s
index a7cb6bf..e276309 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s
@@ -38,6 +38,18 @@ v_tanh_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX1250: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -157,3 +169,19 @@ v_cvt_pk_f16_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cvt_pk_f16_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
index 6acab7e..359aadc 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
@@ -46,6 +46,18 @@ v_tanh_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -213,3 +225,27 @@ v_cvt_pk_f16_fp8 v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cvt_pk_f16_fp8 v1, v2.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
index 7486d84..aa4e49d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s
@@ -127,6 +127,42 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp
v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp
// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00]
+v_prng_b32_e64 v5, v1
+// GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00]
+
+v_prng_b32_e64 v5, v255
+// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00]
+
+v_prng_b32_e64 v5, s1
+// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, s105
+// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, vcc_lo
+// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, vcc_hi
+// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, ttmp15
+// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, m0
+// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, exec_lo
+// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, exec_hi
+// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, null
+// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, -1
+// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00]
+
v_tanh_f32_e64 v5, v1
// GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00]
@@ -684,3 +720,48 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1]
v_cvt_pk_f16_fp8 v1, s2 op_sel:[1]
// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, v2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, s2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, 2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, 0x1234
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, v2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, s2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, 2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, 0x1234
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
index b59b8b3..8f0c43d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
@@ -223,6 +223,42 @@ v_tanh_f16_e64 v255, -|0x8000| clamp div:2
v_tanh_f16 v5.l, v128.h
// GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
+v_prng_b32_e64 v5, v1
+// GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00]
+
+v_prng_b32_e64 v5, v255
+// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00]
+
+v_prng_b32_e64 v5, s1
+// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, s105
+// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, vcc_lo
+// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, vcc_hi
+// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, ttmp15
+// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, m0
+// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, exec_lo
+// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, exec_hi
+// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, null
+// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00]
+
+v_prng_b32_e64 v5, -1
+// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00]
+
v_rcp_bf16_e64 v5, v1
// GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00]
@@ -714,3 +750,54 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1]
v_cvt_pk_f16_fp8 v1, s2 op_sel:[1]
// GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, v2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, s2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, 2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150, 0x1234
+// GFX1250: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_i4_i8 v150.h, v2
+// GFX1250: v_sat_pk4_i4_i8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, v2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, s2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, 2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150, 0x1234
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+v_sat_pk4_u4_u8 v150.h, v2
+// GFX1250: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:0
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1
+// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
index f7f20f4..b21fca6 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
@@ -170,6 +170,50 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mas
// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_mirror
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -673,3 +717,19 @@ v_cvt_pk_f16_fp8 v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x04,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 row_share:1 fi:1
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 row_share:1 fi:1
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
index e1241b0..f14705f 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
@@ -178,6 +178,50 @@ v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_mirror
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_half_mirror
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shl:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shl:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shr:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_shr:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_ror:1
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_ror:15
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -721,3 +765,27 @@ v_cvt_pk_f16_fp8 v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1
v_cvt_pk_f16_fp8 v1, v128.h quad_perm:[0,1,2,3]
// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 row_share:1 fi:1
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 row_share:1 fi:1
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
index 0106175..b2c2943 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
@@ -50,6 +50,10 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -249,3 +253,19 @@ v_cvt_pk_f16_fp8 v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
index 93b86f3..e3c7c0f 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
@@ -58,6 +58,10 @@ v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
@@ -297,3 +301,27 @@ v_cvt_pk_f16_fp8 v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cvt_pk_f16_fp8 v1, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_i4_i8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_sat_pk4_u4_u8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AsmParser/llvm_section_types.s b/llvm/test/MC/AsmParser/llvm_section_types.s
index 147b1499..83e5db0 100644
--- a/llvm/test/MC/AsmParser/llvm_section_types.s
+++ b/llvm/test/MC/AsmParser/llvm_section_types.s
@@ -1,22 +1,34 @@
-## Verify that LLVM-specific section types are correctly inferred from assembly input.
+## Verify that LLVM-specific section types are correctly inferred from assembly input and printed.
+# RUN: llvm-mc -triple i386-pc-linux %s | FileCheck --check-prefix=ASM %s
# RUN: llvm-mc -triple i386-pc-linux -filetype=obj -o %t %s
# RUN: llvm-readobj -S %t | FileCheck %s
+# ASM: .section .section1,"",@llvm_bb_addr_map
.section .section1,"",@llvm_bb_addr_map
.byte 1
+# ASM: .section .section2,"",@llvm_call_graph_profile
.section .section2,"",@llvm_call_graph_profile
.byte 1
+# ASM: .section .section3,"",@llvm_odrtab
.section .section3,"",@llvm_odrtab
.byte 1
+# ASM: .section .section4,"",@llvm_linker_options
.section .section4,"",@llvm_linker_options
.byte 1
+# ASM: .section .section5,"",@llvm_sympart
.section .section5,"",@llvm_sympart
.byte 1
+# ASM: .section .section6,"",@llvm_dependent_libraries
.section .section6,"",@llvm_dependent_libraries
.byte 1
+# ASM: .section .section7,"",@llvm_offloading
.section .section7,"",@llvm_offloading
.byte 1
+# ASM: .section .section8,"",@llvm_lto
.section .section8,"",@llvm_lto
.byte 1
+# ASM: .section .section9,"",@llvm_cfi_jump_table,1
+.section .section9,"",@llvm_cfi_jump_table,1
+.byte 1
# CHECK: Name: .section1
# CHECK-NEXT: Type: SHT_LLVM_BB_ADDR_MAP
@@ -34,3 +46,6 @@
# CHECK-NEXT: Type: SHT_LLVM_OFFLOADING
# CHECK: Name: .section8
# CHECK-NEXT: Type: SHT_LLVM_LTO
+# CHECK: Name: .section9
+# CHECK-NEXT: Type: SHT_LLVM_CFI_JUMP_TABLE
+# CHECK: EntrySize: 1
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
index 5f37ba9..5b90582 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt
@@ -200,6 +200,51 @@
0x81,0x95,0x0a,0x7f
# GFX1250-REAL16: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]
+0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf
+# GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+
+0xc1,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e]
+
+0xf0,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e]
+
+0x7f,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e]
+
+0x7e,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e]
+
+0x7d,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e]
+
+0x7c,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e]
+
+0x01,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e]
+
+0x69,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e]
+
+0xfd,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e]
+
+0x7b,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e]
+
+0x01,0x97,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e]
+
+0xff,0x97,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e]
+
+0x6b,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e]
+
+0x6a,0x96,0x0a,0x7e
+# GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e]
+
0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
@@ -764,3 +809,45 @@
0x03,0xd9,0x02,0x7e
# GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e]
+
+0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+0x82,0xe6,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, 2 ; encoding: [0x82,0xe6,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e]
+
+0x02,0xe6,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, s2 ; encoding: [0x02,0xe6,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e]
+
+0x02,0xe7,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, v2 ; encoding: [0x02,0xe7,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e]
+
+0x02,0xe7,0x02,0x7f
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.h, v2 ; encoding: [0x02,0xe7,0x02,0x7f]
+
+0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+0x82,0xe8,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 2 ; encoding: [0x82,0xe8,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e]
+
+0x02,0xe8,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, s2 ; encoding: [0x02,0xe8,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e]
+
+0x02,0xe9,0x02,0x7e
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, v2 ; encoding: [0x02,0xe9,0x02,0x7e]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e]
+
+0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+
+0x02,0xe9,0x02,0x7f
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
index 57bee27..c12ecb8 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt
@@ -162,6 +162,45 @@
0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff
# GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff]
+0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01
+# GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff
+# GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+
+0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13
+# GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13]
+
0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30
# GFX1250-REAL16: v_rcp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
# GFX1250-FAKE16: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
@@ -669,3 +708,25 @@
0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff
# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff]
+
+0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
+
+0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
+
+0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff]
+
+0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
+
+0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
+
+0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
index 28ec6b1..d3706f9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt
@@ -43,6 +43,15 @@
# GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05]
# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05]
+0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00
+# GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00]
+
+0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05
+# GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
+0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05
+# GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05]
+
0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
@@ -199,3 +208,25 @@
0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05
# GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05]
# GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05]
+
+0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05]
+
+0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+
+0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
index 5004762..1719592 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
@@ -175,6 +175,42 @@
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xca,0xd5,0x80,0x01,0x00,0x00]
+0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00]
+
+0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00
+# GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]
+
0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
@@ -888,3 +924,59 @@
0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00
# GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v2.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00]
# GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00]
+
+0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00]
+
+0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00]
+
+0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00]
+
+0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00]
+
+0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00]
+
+0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00]
+
+0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00]
+
+0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00]
+
+0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00]
+
+0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00
+# GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00
+# GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00
+# GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00]
+
+0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00
+# GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
index de908b9..34d2104a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
@@ -104,6 +104,39 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff]
+
+0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
+# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]
+
0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
@@ -704,3 +737,27 @@
0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff
# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff]
+
+0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+
+0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+
+0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+
+0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+
+0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff]
+
+0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
index cfe7173..867fee5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
# GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
@@ -34,6 +34,9 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05
+# GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
+
0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
@@ -286,3 +289,27 @@
0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05
# GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
# GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
+
+0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+
+0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05
+# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
+# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Hexagon/two_ext.s b/llvm/test/MC/Hexagon/two_ext.s
index 28b2aa3..09b51c5 100644
--- a/llvm/test/MC/Hexagon/two_ext.s
+++ b/llvm/test/MC/Hexagon/two_ext.s
@@ -6,7 +6,7 @@
if (!p1) call foo_b
}
# CHECK: 00004000 { immext(#0)
-# CHECK: 5d004100 if (p1) call 0x0
+# CHECK: 5d004100 if (p1) call 0x0 <.text>
# CHECK: 00004000 immext(#0)
-# CHECK: 5d20c100 if (!p1) call 0x0 }
+# CHECK: 5d20c100 if (!p1) call 0x0 <.text> }
diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s
index 24f3e67..f722584 100644
--- a/llvm/test/MC/RISCV/Relocations/mc-dump.s
+++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s
@@ -9,6 +9,7 @@
# CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00]
# CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023
# CHECK-NEXT: Symbol @0 $x
+# CHECK-NEXT:8 Data Size:0 []
# CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
# CHECK-NEXT:12 Data Size:4 [13,05,30,00]
# CHECK-NEXT:16 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll
new file mode 100644
index 0000000..c7a0de22
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=loop-reduce -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %p, i64 %idx) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x [4 x i32]], align 16
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr [[ALLOCA]])
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 6
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 48
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr nuw i8, ptr [[ALLOCA]], i64 48
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ -8, %[[ENTRY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[LSR_IV]], 2
+; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[SCEVGEP8]], i64 32
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SCEVGEP9]], align 4
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[SCEVGEP6]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SCEVGEP7]], align 4
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SCEVGEP3]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCEVGEP5]], align 4
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP2]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 4
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr [[ALLOCA]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %alloca = alloca [4 x [4 x i32]], align 16
+ call void @llvm.lifetime.start.p0(i64 64, ptr %alloca)
+ br label %loop
+
+loop:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ]
+ %gep1 = getelementptr [4 x [12 x [4 x [4 x i32]]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv, i64 0
+ %0 = load i32, ptr %gep1, align 4
+ %gep2 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv
+ %1 = load i32, ptr %gep2, align 4
+ %gep3 = getelementptr [4 x [4 x i32]], ptr %alloca, i64 0, i64 3, i64 %indvars.iv
+ %2 = load i32, ptr %gep3, align 4
+ %gep4 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 %idx, i64 3, i64 %indvars.iv
+ %3 = load i32, ptr %gep4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv, 1
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ call void @llvm.lifetime.end.p0(i64 64, ptr %alloca)
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index f7e629f..d4e3238 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s -slp-threshold=-3 | FileCheck %s --check-prefix=THRESH
%struct.ImageParameters = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], ptr, ptr, ptr, ptr, ptr, [1200 x %struct.syntaxelement], ptr, ptr, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, ptr, ptr, ptr, ptr, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [15 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], i32, i32, i32 }
%struct.syntaxelement = type { i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr }
@@ -94,6 +95,89 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
; CHECK-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
; CHECK-NEXT: ret i32 0
;
+; THRESH-LABEL: define fastcc i32 @test(
+; THRESH-SAME: i32 [[TMP0:%.*]], i32 [[ADD111_I_I:%.*]], <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; THRESH-NEXT: [[ENTRY:.*:]]
+; THRESH-NEXT: [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
+; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
+; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
+; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
+; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
+; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
+; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
+; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
+; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
+; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
+; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
+; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
+; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
+; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
+; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
+; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
+; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
+; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
+; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
+; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
+; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
+; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
+; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
+; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
+; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
+; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
+; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
+; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
+; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
+; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
+; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
+; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
+; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
+; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
+; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
+; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
+; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
+; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
+; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
+; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
+; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
+; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
+; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
+; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
+; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
+; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
+; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
+; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
+; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
+; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
+; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
+; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
+; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
+; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
+; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
+; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
+; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT: ret i32 0
+;
entry:
%LoopArray.sroa.24.0.i.i3 = ashr i32 %0, 1
%shr143.5.i.i9 = ashr i32 %0, 1
diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
new file mode 100644
index 0000000..6a4927e
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
@@ -0,0 +1,47 @@
+/// Checks that various hexagon scenarios are handled correctly:
+/// - branch targets
+/// - endloops
+/// - inline-relocs
+/// - multi-insn bundles
+
+{
+ r6 = sub(r1, r0)
+ r7 = and(r4, #0x0)
+ if (p1) jump:t target1
+ if (p2) jump:nt target2
+}
+
+{
+ r8 = r7
+ r9 = add(r8, #0)
+ r10 = memw(r9)
+} :endloop0
+
+{ jump ##sym }
+
+target1:
+ nop
+
+target2:
+ nop
+
+// RUN: llvm-mc %s --triple=hexagon -filetype=obj | llvm-objdump -d -r - | FileCheck %s
+
+// CHECK: 00000000 <.text>:
+// CHECK-NEXT: 0: 12 51 00 5c 5c005112 { if (p1) jump:t 0x24 <target1>
+// CHECK-NEXT: 4: 14 42 00 5c 5c004214 if (p2) jump:nt 0x28 <target2>
+// CHECK-NEXT: 8: 06 41 20 f3 f3204106 r6 = sub(r1,r0)
+// CHECK-NEXT: c: 07 c0 04 76 7604c007 r7 = and(r4,#0x0) }
+// CHECK-NEXT: 10: 08 80 67 70 70678008 { r8 = r7
+// CHECK-NEXT: 14: 09 40 08 b0 b0084009 r9 = add(r8,#0x0)
+// CHECK-NEXT: 18: 0a c0 89 91 9189c00a r10 = memw(r9+#0x0) } :endloop0
+// CHECK-NEXT: 1c: 00 40 00 00 00004000 { immext(#0x0)
+// CHECK-NEXT: 0000001c: R_HEX_B32_PCREL_X sym
+// CHECK-NEXT: 20: 00 c0 00 58 5800c000 jump 0x1c <.text+0x1c> }
+// CHECK-NEXT: 00000020: R_HEX_B22_PCREL_X sym+0x4
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000024 <target1>:
+// CHECK-NEXT: 24: 00 c0 00 7f 7f00c000 { nop }
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000028 <target2>:
+// CHECK-NEXT: 28: 00 c0 00 7f 7f00c000 { nop }
diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp
index 607184e..8672793 100644
--- a/llvm/tools/llvm-mc/Disassembler.cpp
+++ b/llvm/tools/llvm-mc/Disassembler.cpp
@@ -45,7 +45,11 @@ static bool PrintInsts(const MCDisassembler &DisAsm, const ByteArrayTy &Bytes,
MCInst Inst;
MCDisassembler::DecodeStatus S;
- S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
+ if (STI.getTargetTriple().getArch() == Triple::hexagon)
+ S = DisAsm.getInstructionBundle(Inst, Size, Data.slice(Index), Index,
+ nulls());
+ else
+ S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
switch (S) {
case MCDisassembler::Fail:
SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]),
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index c5967cd..74eb903 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -693,6 +693,30 @@ public:
} else
OS << "\t<unknown>";
}
+
+ virtual void emitPostInstructionInfo(formatted_raw_ostream &FOS,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI,
+ StringRef Comments,
+ LiveVariablePrinter &LVP) {
+ do {
+ if (!Comments.empty()) {
+ // Emit a line of comments.
+ StringRef Comment;
+ std::tie(Comment, Comments) = Comments.split('\n');
+ // MAI.getCommentColumn() assumes that instructions are printed at the
+ // position of 8, while getInstStartColumn() returns the actual
+ // position.
+ unsigned CommentColumn =
+ MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
+ FOS.PadToColumn(CommentColumn);
+ FOS << MAI.getCommentString() << ' ' << Comment;
+ }
+ LVP.printAfterInst(FOS);
+ FOS << "\n";
+ } while (!Comments.empty());
+ FOS.flush();
+ }
};
PrettyPrinter PrettyPrinterInst;
@@ -714,6 +738,35 @@ public:
}
}
}
+
+ /// Builds the text that terminates the current instruction's output line.
+ ///
+ /// When the packet is being closed, the closing brace is emitted, followed
+ /// by an end-of-loop marker if IsLoop0 and/or IsLoop1 is set. The result
+ /// always ends with a newline. This only reads the printer state; it does
+ /// not modify it.
+ std::string getInstructionSeparator() const {
+ SmallString<40> Separator;
+ raw_svector_ostream OS(Separator);
+ if (ShouldClosePacket) {
+ OS << " }";
+ // Separate the closing brace from the end-of-loop marker, if any.
+ if (IsLoop0 || IsLoop1)
+ OS << " ";
+ // Both loop flags set yields the combined marker.
+ if (IsLoop0)
+ OS << (IsLoop1 ? ":endloop01" : ":endloop0");
+ else if (IsLoop1)
+ OS << ":endloop1";
+ }
+ OS << '\n';
+ return OS.str().str();
+ }
+
+  /// Hexagon override: ends the current instruction's output line.
+  ///
+  /// Writes \p LVP's post-instruction output followed by
+  /// getInstructionSeparator(), flushes \p FOS, and resets the per-packet
+  /// state once the packet has been closed. \p MAI, \p STI and \p Comments
+  /// are not used by this override.
+  void emitPostInstructionInfo(formatted_raw_ostream &FOS, const MCAsmInfo &MAI,
+                               const MCSubtargetInfo &STI, StringRef Comments,
+                               LiveVariablePrinter &LVP) override {
+    // Hexagon does not write anything to the comment stream, so only the
+    // separator needs to be printed.
+    LVP.printAfterInst(FOS);
+    FOS << getInstructionSeparator();
+    FOS.flush();
+
+    // Packet still open: keep the accumulated state for the next instruction.
+    if (!ShouldClosePacket)
+      return;
+    reset();
+  }
+
void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
object::SectionedAddress Address, formatted_raw_ostream &OS,
StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
@@ -724,60 +777,64 @@ public:
if (!MI) {
printLead(Bytes, Address.Address, OS);
OS << " <unknown>";
+ reset();
return;
}
- std::string Buffer;
+
+ StringRef Preamble = IsStartOfBundle ? " { " : " ";
+
+ if (SP && (PrintSource || PrintLines))
+ SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
+ printLead(Bytes, Address.Address, OS);
+ OS << Preamble;
+ std::string Buf;
{
- raw_string_ostream TempStream(Buffer);
+ raw_string_ostream TempStream(Buf);
IP.printInst(MI, Address.Address, "", STI, TempStream);
}
- StringRef Contents(Buffer);
- // Split off bundle attributes
- auto PacketBundle = Contents.rsplit('\n');
- // Split off first instruction from the rest
- auto HeadTail = PacketBundle.first.split('\n');
- auto Preamble = " { ";
- auto Separator = "";
-
- // Hexagon's packets require relocations to be inline rather than
- // clustered at the end of the packet.
- std::vector<RelocationRef>::const_iterator RelCur = Rels->begin();
- std::vector<RelocationRef>::const_iterator RelEnd = Rels->end();
- auto PrintReloc = [&]() -> void {
- while ((RelCur != RelEnd) && (RelCur->getOffset() <= Address.Address)) {
- if (RelCur->getOffset() == Address.Address) {
- printRelocation(OS, ObjectFilename, *RelCur, Address.Address, false);
- return;
- }
- ++RelCur;
- }
- };
+ StringRef Contents(Buf);
+
+ auto Duplex = Contents.split('\v');
+ bool HasDuplex = !Duplex.second.empty();
+ if (HasDuplex) {
+ OS << Duplex.first;
+ OS << "; ";
+ OS << Duplex.second;
+ } else {
+ OS << Duplex.first;
+ }
- while (!HeadTail.first.empty()) {
- OS << Separator;
- Separator = "\n";
- if (SP && (PrintSource || PrintLines))
- SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
- printLead(Bytes, Address.Address, OS);
- OS << Preamble;
- Preamble = " ";
- StringRef Inst;
- auto Duplex = HeadTail.first.split('\v');
- if (!Duplex.second.empty()) {
- OS << Duplex.first;
- OS << "; ";
- Inst = Duplex.second;
- }
+ uint32_t Instruction = support::endian::read32le(Bytes.data());
+
+ uint32_t ParseMask = 0x0000c000;
+ uint32_t PacketEndMask = 0x0000c000;
+ uint32_t LoopEndMask = 0x00008000;
+ uint32_t ParseBits = Instruction & ParseMask;
+
+ if (ParseBits == LoopEndMask) {
+ if (IsStartOfBundle)
+ IsLoop0 = true;
else
- Inst = HeadTail.first;
- OS << Inst;
- HeadTail = HeadTail.second.split('\n');
- if (HeadTail.first.empty())
- OS << " } " << PacketBundle.second;
- PrintReloc();
- Bytes = Bytes.slice(4);
- Address.Address += 4;
+ IsLoop1 = true;
}
+
+ IsStartOfBundle = false;
+
+ if (ParseBits == PacketEndMask || HasDuplex)
+ ShouldClosePacket = true;
+ }
+
+private:
+ // Per-packet state carried across successive printInst() calls.
+ //
+ // True until the first instruction of a packet has been printed; selects
+ // the opening-brace preamble and which loop flag a loop-end marks.
+ bool IsStartOfBundle = true;
+ // Set when the first instruction of the packet has loop-end parse bits.
+ bool IsLoop0 = false;
+ // Set when a later instruction of the packet has loop-end parse bits.
+ bool IsLoop1 = false;
+ // Set when the instruction has packet-end parse bits or is a duplex.
+ bool ShouldClosePacket = false;
+
+  /// Restores the per-packet state to its initial values so that the next
+  /// instruction printed is treated as the start of a new packet.
+  void reset() {
+    IsLoop0 = IsLoop1 = ShouldClosePacket = false;
+    IsStartOfBundle = true;
+  }
};
HexagonPrettyPrinter HexagonPrettyPrinterInst;
@@ -1610,29 +1667,6 @@ static StringRef getSegmentName(const MachOObjectFile *MachO,
return "";
}
-static void emitPostInstructionInfo(formatted_raw_ostream &FOS,
- const MCAsmInfo &MAI,
- const MCSubtargetInfo &STI,
- StringRef Comments,
- LiveVariablePrinter &LVP) {
- do {
- if (!Comments.empty()) {
- // Emit a line of comments.
- StringRef Comment;
- std::tie(Comment, Comments) = Comments.split('\n');
- // MAI.getCommentColumn() assumes that instructions are printed at the
- // position of 8, while getInstStartColumn() returns the actual position.
- unsigned CommentColumn =
- MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
- FOS.PadToColumn(CommentColumn);
- FOS << MAI.getCommentString() << ' ' << Comment;
- }
- LVP.printAfterInst(FOS);
- FOS << '\n';
- } while (!Comments.empty());
- FOS.flush();
-}
-
static void createFakeELFSections(ObjectFile &Obj) {
assert(Obj.isELF());
if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(&Obj))
@@ -2526,15 +2560,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
}
assert(DT->Context->getAsmInfo());
- emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
- *DT->SubtargetInfo, CommentStream.str(), LVP);
+ DT->Printer->emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
+ *DT->SubtargetInfo,
+ CommentStream.str(), LVP);
Comments.clear();
if (BTF)
printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP);
- // Hexagon handles relocs in pretty printer
- if (InlineRelocs && Obj.getArch() != Triple::hexagon) {
+ if (InlineRelocs) {
while (findRel()) {
// When --adjust-vma is used, update the address printed.
printRelocation(FOS, Obj.getFileName(), *RelCur,
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index 465c1896..ccc64fe 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -5511,7 +5511,7 @@ template <typename ELFT> static GNUAbiTag getGNUAbiTag(ArrayRef<uint8_t> Desc) {
return {"", "", /*IsValid=*/false};
static const char *OSNames[] = {
- "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl",
+ "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable",
};
StringRef OSName = "Unknown";
if (Words[0] < std::size(OSNames))
diff --git a/llvm/unittests/ADT/STLForwardCompatTest.cpp b/llvm/unittests/ADT/STLForwardCompatTest.cpp
index e3d500a..4a8f53c 100644
--- a/llvm/unittests/ADT/STLForwardCompatTest.cpp
+++ b/llvm/unittests/ADT/STLForwardCompatTest.cpp
@@ -10,6 +10,11 @@
#include "CountCopyAndMove.h"
#include "gtest/gtest.h"
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
namespace {
template <typename T>
@@ -142,6 +147,26 @@ TEST(TransformTest, MoveTransformLlvm) {
EXPECT_EQ(0, CountCopyAndMove::Destructions);
}
+// Checks that transformOptional forwards the value category of the optional
+// to the callback: an lvalue optional must not produce an rvalue reference,
+// while an rvalue optional must.
+TEST(TransformTest, TransformCategory) {
+  struct Base {
+    int x;
+  };
+  // Only constructible from an rvalue Base.
+  struct Derived : Base {
+    Derived(Base &&B) : Base(std::move(B)) {}
+  };
+
+  std::optional<Base> Opt{Base{}};
+
+  // Lvalue optional: the callback parameter is not an rvalue reference.
+  llvm::transformOptional(Opt, [](auto &&Val) {
+    EXPECT_FALSE(std::is_rvalue_reference_v<decltype(Val)>);
+    return Derived{std::move(Val)};
+  });
+
+  // Rvalue optional: the callback parameter is an rvalue reference.
+  llvm::transformOptional(std::move(Opt), [](auto &&Val) {
+    EXPECT_TRUE(std::is_rvalue_reference_v<decltype(Val)>);
+    return Derived{std::move(Val)};
+  });
+}
+
TEST(TransformTest, ToUnderlying) {
enum E { A1 = 0, B1 = -1 };
static_assert(llvm::to_underlying(A1) == 0);
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 50346c2..b07ea9e9 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -2114,7 +2114,8 @@ void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef,
const Record *WRDef = ProcWriteResDef->getValueAsDef("WriteType");
if (!WRMap.try_emplace(WRDef, ProcWriteResDef).second)
PrintFatalError(ProcWriteResDef->getLoc(),
- "WriteType already used in another WriteRes");
+ "WriteType of " + WRDef->getName() +
+ " already used in another WriteRes");
}
// Visit ProcResourceKinds referenced by the newly discovered WriteRes.
@@ -2148,7 +2149,8 @@ void CodeGenSchedModels::addReadAdvance(const Record *ProcReadAdvanceDef,
const Record *RADef = ProcReadAdvanceDef->getValueAsDef("ReadType");
if (!RAMap.try_emplace(RADef, ProcReadAdvanceDef).second)
PrintFatalError(ProcReadAdvanceDef->getLoc(),
- "ReadType already used in another ReadAdvance");
+ "ReadType of " + RADef->getName() +
+ " already used in another ReadAdvance");
}
}
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 82ec812..5309b5d 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -35,7 +35,6 @@ if (current_toolchain == default_toolchain) {
"_LIBCPP_HAS_LOCALIZATION=1",
"_LIBCPP_HAS_UNICODE=1",
"_LIBCPP_HAS_WIDE_CHARACTERS=1",
- "_LIBCPP_HAS_NO_STD_MODULES=",
"_LIBCPP_HAS_TERMINAL=1",
"_LIBCPP_INSTRUMENTED_WITH_ASAN=",
"_LIBCPP_ABI_DEFINES=",
diff --git a/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h b/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h
index ac49159..ff99d7c 100644
--- a/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h
+++ b/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h
@@ -21,9 +21,6 @@ class RewritePatternSet;
/// Collect a set of patterns to convert from the vector to XeGPU ops.
void populateVectorToXeGPUConversionPatterns(RewritePatternSet &patterns);
-/// Create a pass to convert ops from vector to XeGPU.
-std::unique_ptr<Pass> createConvertVectorToXeGPUPass();
-
} // namespace mlir
#endif // MLIR_CONVERSION_VECTORTOXEGPU_VECTORTOXEGPU_H
diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td
index 2d9befe..2016bea 100644
--- a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td
@@ -77,4 +77,11 @@ def SPIRVWebGPUPreparePass : Pass<"spirv-webgpu-prepare", "spirv::ModuleOp"> {
"and replacing with supported ones";
}
+// Pass registered under the `spirv-promote-to-replicated-constants` flag and
+// anchored on `spirv::ModuleOp`.
+def SPIRVReplicatedConstantCompositePass
+ : Pass<"spirv-promote-to-replicated-constants", "spirv::ModuleOp"> {
+ let summary = "Convert splat composite constants and spec constants to "
+ "corresponding replicated constant composite ops defined by "
+ "SPV_EXT_replicated_composites";
+}
+
#endif // MLIR_DIALECT_SPIRV_TRANSFORMS_PASSES
diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
index db244d1..0b7ffa4 100644
--- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
+++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
@@ -16,7 +16,9 @@
#include "mlir/Dialect/EmitC/IR/EmitC.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeRange.h"
#include "mlir/Transforms/DialectConversion.h"
using namespace mlir;
@@ -77,13 +79,23 @@ struct ConvertAlloca final : public OpConversionPattern<memref::AllocaOp> {
}
};
+/// Converts the type of a memref global with \p typeConverter.
+///
+/// A rank-0 memref is converted through its element type, so the result is
+/// the converted scalar type; any other memref is converted as a whole.
+/// Returns whatever the type converter produces for the chosen type.
+Type convertMemRefType(MemRefType opTy, const TypeConverter *typeConverter) {
+  if (opTy.getRank() == 0)
+    return typeConverter->convertType(mlir::getElementTypeOrSelf(opTy));
+  return typeConverter->convertType(opTy);
+}
+
struct ConvertGlobal final : public OpConversionPattern<memref::GlobalOp> {
using OpConversionPattern::OpConversionPattern;
LogicalResult
matchAndRewrite(memref::GlobalOp op, OpAdaptor operands,
ConversionPatternRewriter &rewriter) const override {
-
+ MemRefType opTy = op.getType();
if (!op.getType().hasStaticShape()) {
return rewriter.notifyMatchFailure(
op.getLoc(), "cannot transform global with dynamic shape");
@@ -95,7 +107,9 @@ struct ConvertGlobal final : public OpConversionPattern<memref::GlobalOp> {
op.getLoc(), "global variable with alignment requirement is "
"currently not supported");
}
- auto resultTy = getTypeConverter()->convertType(op.getType());
+
+ Type resultTy = convertMemRefType(opTy, getTypeConverter());
+
if (!resultTy) {
return rewriter.notifyMatchFailure(op.getLoc(),
"cannot convert result type");
@@ -114,6 +128,10 @@ struct ConvertGlobal final : public OpConversionPattern<memref::GlobalOp> {
bool externSpecifier = !staticSpecifier;
Attribute initialValue = operands.getInitialValueAttr();
+ if (opTy.getRank() == 0) {
+ auto elementsAttr = llvm::cast<ElementsAttr>(*op.getInitialValue());
+ initialValue = elementsAttr.getSplatValue<Attribute>();
+ }
if (isa_and_present<UnitAttr>(initialValue))
initialValue = {};
@@ -132,11 +150,23 @@ struct ConvertGetGlobal final
matchAndRewrite(memref::GetGlobalOp op, OpAdaptor operands,
ConversionPatternRewriter &rewriter) const override {
- auto resultTy = getTypeConverter()->convertType(op.getType());
+ MemRefType opTy = op.getType();
+ Type resultTy = convertMemRefType(opTy, getTypeConverter());
+
if (!resultTy) {
return rewriter.notifyMatchFailure(op.getLoc(),
"cannot convert result type");
}
+
+ if (opTy.getRank() == 0) {
+ emitc::LValueType lvalueType = emitc::LValueType::get(resultTy);
+ emitc::GetGlobalOp globalLValue = rewriter.create<emitc::GetGlobalOp>(
+ op.getLoc(), lvalueType, operands.getNameAttr());
+ emitc::PointerType pointerType = emitc::PointerType::get(resultTy);
+ rewriter.replaceOpWithNewOp<emitc::ApplyOp>(
+ op, pointerType, rewriter.getStringAttr("&"), globalLValue);
+ return success();
+ }
rewriter.replaceOpWithNewOp<emitc::GetGlobalOp>(op, resultTy,
operands.getNameAttr());
return success();
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
index 5a10883..28d99b1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -893,6 +893,13 @@ struct PackOpTiling
SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes;
DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
packOp.getDimAndTileMapping();
+ SmallVector<int64_t> outerShapeWithoutTranspose(
+ packOp.getDestType().getShape().take_front(packOp.getSourceRank()));
+ if (!packOp.getOuterDimsPerm().empty()) {
+ applyPermutationToVector(
+ outerShapeWithoutTranspose,
+ invertPermutationVector(packOp.getOuterDimsPerm()));
+ }
for (auto dim : llvm::seq<int64_t>(packOp.getSourceRank())) {
if (dimAndTileMapping.count(dim)) {
FailureOr<int64_t> cstTileSize =
@@ -904,14 +911,16 @@ struct PackOpTiling
// If a dimension is not tiled, it is always valid to fuse the pack op,
// even if the op has padding semantics. Because it always generates a
- // full slice along the dimension.
+ // full slice along the dimension. The tile sizes are for unpacked
+ // domain, i.e., `srcDimSize`, so `tileSize < srcDimSize` means that the
+ // dimension is tiled.
// TODO: It could be untiled if the `srcDimSize` is dynamic. It is a
// hard check to determine if a dimension is tiled or not.
int64_t srcDimSize = packOp.getSourceType().getDimSize(dim);
- int64_t destDimSize = packOp.getDestType().getDimSize(dim);
+ int64_t destDimSize = outerShapeWithoutTranspose[dim];
bool isTiled = failed(cstTileSize) ||
ShapedType::isDynamic(srcDimSize) ||
- cstTileSize.value() != srcDimSize;
+ cstTileSize.value() < srcDimSize;
if (!isTiled) {
outerDimOffsets.push_back(offsets[dim]);
if (ShapedType::isStatic(destDimSize)) {
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
index 68e0206..b947447 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
set(LLVM_OPTIONAL_SOURCES
CanonicalizeGLPass.cpp
+ ConvertToReplicatedConstantCompositePass.cpp
DecorateCompositeTypeLayoutPass.cpp
LowerABIAttributesPass.cpp
RewriteInsertsPass.cpp
@@ -30,6 +31,7 @@ add_mlir_dialect_library(MLIRSPIRVConversion
add_mlir_dialect_library(MLIRSPIRVTransforms
CanonicalizeGLPass.cpp
+ ConvertToReplicatedConstantCompositePass.cpp
DecorateCompositeTypeLayoutPass.cpp
LowerABIAttributesPass.cpp
RewriteInsertsPass.cpp
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp
new file mode 100644
index 0000000..dbbe23a
--- /dev/null
+++ b/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp
@@ -0,0 +1,129 @@
+//===- ConvertToReplicatedConstantCompositePass.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert a splat composite spirv.Constant and
+// spirv.SpecConstantComposite to spirv.EXT.ConstantCompositeReplicate and
+// spirv.EXT.SpecConstantCompositeReplicate respectively.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+
+namespace mlir::spirv {
+#define GEN_PASS_DEF_SPIRVREPLICATEDCONSTANTCOMPOSITEPASS
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h.inc"
+
+namespace {
+
+/// Returns the type of `attr` when it is used as a constituent of an array
+/// constant.
+///
+/// A typed attribute reports its own type. An `ArrayAttr` is mapped to an
+/// `ArrayType` whose element type is derived recursively from its first
+/// element and whose length is the number of elements. A null type is
+/// returned for an empty array, for an array whose element type cannot be
+/// derived, and for any other kind of attribute.
+static Type getArrayElemType(Attribute attr) {
+  if (auto typedAttr = dyn_cast<TypedAttr>(attr))
+    return typedAttr.getType();
+
+  if (auto arrayAttr = dyn_cast<ArrayAttr>(attr)) {
+    // An empty array has no first element to derive a type from.
+    if (arrayAttr.empty())
+      return nullptr;
+
+    // Propagate failure instead of building an array of a null type.
+    Type elemType = getArrayElemType(arrayAttr[0]);
+    if (!elemType)
+      return nullptr;
+
+    return ArrayType::get(elemType, arrayAttr.size());
+  }
+
+  return nullptr;
+}
+
+/// Finds the value that is replicated throughout a composite constant.
+///
+/// Returns the splat attribute together with the number of times it is
+/// replicated:
+///  - for a `SplatElementsAttr`, the splat value and the attribute's size;
+///  - for a non-empty `ArrayAttr` whose elements are all equal, the
+///    inner-most splat value found by recursing into the first element, with
+///    the element counts of each level multiplied together; when the element
+///    itself is not a splat, the element and the array size are returned.
+/// Returns `{nullptr, 1}` when `valueType` is null or not a
+/// `spirv::CompositeType`, or when `valueAttr` is not a splat.
+static std::pair<Attribute, uint32_t>
+getSplatAttrAndNumElements(Attribute valueAttr, Type valueType) {
+  auto compositeType = dyn_cast_or_null<spirv::CompositeType>(valueType);
+  if (!compositeType)
+    return {nullptr, 1};
+
+  if (auto splatAttr = dyn_cast<SplatElementsAttr>(valueAttr))
+    return {splatAttr.getSplatValue<Attribute>(), splatAttr.size()};
+
+  auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr);
+  // `llvm::all_equal` is true for an empty range, so reject an empty array
+  // explicitly before its first element is read.
+  if (!arrayAttr || arrayAttr.empty() || !llvm::all_equal(arrayAttr))
+    return {nullptr, 1};
+
+  Attribute attr = arrayAttr[0];
+  uint32_t numElements = arrayAttr.size();
+
+  // Find the inner-most splat value for array of composites
+  auto [newAttr, newNumElements] =
+      getSplatAttrAndNumElements(attr, getArrayElemType(attr));
+  if (newAttr) {
+    attr = newAttr;
+    numElements *= newNumElements;
+  }
+  return {attr, numElements};
+}
+
+/// Replaces a `spirv::ConstantOp` whose value is a splat composite with an
+/// `EXTConstantCompositeReplicateOp` of the same result type that carries
+/// only the replicated value. The match fails when the value is not a splat
+/// or when the replicated value occurs exactly once.
+struct ConstantOpConversion final : OpRewritePattern<spirv::ConstantOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(spirv::ConstantOp op,
+                                PatternRewriter &rewriter) const override {
+    std::pair<Attribute, uint32_t> splat =
+        getSplatAttrAndNumElements(op.getValue(), op.getType());
+
+    Attribute splatValue = splat.first;
+    if (!splatValue)
+      return rewriter.notifyMatchFailure(op, "composite is not splat");
+
+    // Replicating a single constituent would not simplify anything.
+    const uint32_t replicaCount = splat.second;
+    if (replicaCount == 1)
+      return rewriter.notifyMatchFailure(op,
+                                         "composite has only one constituent");
+
+    rewriter.replaceOpWithNewOp<spirv::EXTConstantCompositeReplicateOp>(
+        op, op.getType(), splatValue);
+    return success();
+  }
+};
+
+/// Replaces a `spirv::SpecConstantCompositeOp` whose constituents are all the
+/// same symbol with an `EXTSpecConstantCompositeReplicateOp` that keeps the
+/// original type and symbol name and references that constituent once.
+///
+/// The match fails when the op's type is not a composite type, when there
+/// are fewer than two constituents, when the constituents differ, or when the
+/// replicated constituent is not a flat symbol reference.
+struct SpecConstantCompositeOpConversion final
+    : OpRewritePattern<spirv::SpecConstantCompositeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(spirv::SpecConstantCompositeOp op,
+                                PatternRewriter &rewriter) const override {
+    auto compositeType = dyn_cast_or_null<spirv::CompositeType>(op.getType());
+    if (!compositeType)
+      return rewriter.notifyMatchFailure(op, "not a composite constant");
+
+    ArrayAttr constituents = op.getConstituents();
+    // `llvm::all_equal` is true for an empty range, so bail out here before
+    // the first constituent is read below.
+    if (constituents.empty())
+      return rewriter.notifyMatchFailure(op, "composite has no constituents");
+
+    if (constituents.size() == 1)
+      return rewriter.notifyMatchFailure(op,
+                                         "composite has only one constituent");
+
+    if (!llvm::all_equal(constituents))
+      return rewriter.notifyMatchFailure(op, "composite is not splat");
+
+    auto splatConstituent = dyn_cast<FlatSymbolRefAttr>(constituents[0]);
+    if (!splatConstituent)
+      return rewriter.notifyMatchFailure(
+          op, "expected flat symbol reference for splat constituent");
+
+    rewriter.replaceOpWithNewOp<spirv::EXTSpecConstantCompositeReplicateOp>(
+        op, TypeAttr::get(op.getType()), op.getSymNameAttr(), splatConstituent);
+
+    return success();
+  }
+};
+
+/// Pass driver: registers the constant and spec-constant conversion patterns
+/// and applies them with a single walk over the operation the pass runs on.
+struct ConvertToReplicatedConstantCompositePass final
+    : spirv::impl::SPIRVReplicatedConstantCompositePassBase<
+          ConvertToReplicatedConstantCompositePass> {
+  void runOnOperation() override {
+    MLIRContext &ctx = getContext();
+
+    RewritePatternSet patterns(&ctx);
+    patterns.add<ConstantOpConversion, SpecConstantCompositeOpConversion>(
+        &ctx);
+
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
+  }
+};
+
+} // namespace
+} // namespace mlir::spirv
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir
index d37fd1d..2b4eda3 100644
--- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir
@@ -41,6 +41,8 @@ func.func @memref_load(%buff : memref<4x8xf32>, %i: index, %j: index) -> f32 {
module @globals {
memref.global "private" constant @internal_global : memref<3x7xf32> = dense<4.0>
// CHECK-NEXT: emitc.global static const @internal_global : !emitc.array<3x7xf32> = dense<4.000000e+00>
+ memref.global "private" constant @__constant_xi32 : memref<i32> = dense<-1>
+ // CHECK-NEXT: emitc.global static const @__constant_xi32 : i32 = -1
memref.global @public_global : memref<3x7xf32>
// CHECK-NEXT: emitc.global extern @public_global : !emitc.array<3x7xf32>
memref.global @uninitialized_global : memref<3x7xf32> = uninitialized
@@ -50,6 +52,9 @@ module @globals {
func.func @use_global() {
// CHECK-NEXT: emitc.get_global @public_global : !emitc.array<3x7xf32>
%0 = memref.get_global @public_global : memref<3x7xf32>
+ // CHECK-NEXT: emitc.get_global @__constant_xi32 : !emitc.lvalue<i32>
+ // CHECK-NEXT: emitc.apply "&"(%1) : (!emitc.lvalue<i32>) -> !emitc.ptr<i32>
+ %1 = memref.get_global @__constant_xi32 : memref<i32>
return
}
}
diff --git a/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir b/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir
new file mode 100644
index 0000000..56e26ee
--- /dev/null
+++ b/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir
@@ -0,0 +1,283 @@
+// RUN: mlir-opt --spirv-promote-to-replicated-constants --split-input-file %s | FileCheck %s
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ReplicatedCompositesEXT], [SPV_EXT_replicated_composites]> {
+ spirv.func @splat_vector_of_i32() -> (vector<3xi32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : vector<3xi32>
+ %0 = spirv.Constant dense<2> : vector<3xi32>
+ spirv.ReturnValue %0 : vector<3xi32>
+ }
+
+ spirv.func @splat_array_of_i32() -> (!spirv.array<3 x i32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<3 x i32>
+ %0 = spirv.Constant [1 : i32, 1 : i32, 1 : i32] : !spirv.array<3 x i32>
+ spirv.ReturnValue %0 : !spirv.array<3 x i32>
+ }
+
+ spirv.func @splat_array_of_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x !spirv.array<3 x i32>>
+ %0 = spirv.Constant [[3 : i32, 3 : i32, 3 : i32], [3 : i32, 3 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>>
+ }
+
+ spirv.func @splat_array_of_non_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>>
+ %0 = spirv.Constant [[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>>
+ }
+
+ spirv.func @splat_array_of_vectors_of_i32() -> (!spirv.array<2xvector<2xi32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>>
+ %0 = spirv.Constant [dense<[1, 2]> : vector<2xi32>, dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>>
+ }
+
+ spirv.func @splat_array_of_splat_vectors_of_i32() -> (!spirv.array<2 x vector<2xi32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.array<2 x vector<2xi32>>
+ %0 = spirv.Constant [dense<2> : vector<2xi32>, dense<2> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>>
+ }
+
+ spirv.func @splat_tensor_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x !spirv.array<3 x i32>>
+ %0 = spirv.Constant dense<3> : tensor<2x3xi32> : !spirv.array<2 x !spirv.array<3 x i32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>>
+ }
+
+ spirv.func @splat_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.arm.tensor<2x3xi32>
+ %0 = spirv.Constant dense<2> : !spirv.arm.tensor<2x3xi32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32>
+ }
+
+ spirv.func @array_of_splat_array_of_non_splat_vectors_of_i32() -> (!spirv.array<1 x !spirv.array<2 x vector<2xi32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<1 x !spirv.array<2 x vector<2xi32>>
+ %0 = spirv.Constant [[dense<[1, 2]> : vector<2xi32>, dense<[1, 2]> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<2 x vector<2xi32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<2 x vector<2xi32>>>
+ }
+
+ spirv.func @array_of_one_splat_array_of_vector_of_one_i32() -> !spirv.array<1 x !spirv.array<2 x vector<1xi32>>> "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1> : vector<1xi32>] : !spirv.array<1 x !spirv.array<2 x vector<1xi32>
+ %cst = spirv.Constant [[dense<1> : vector<1xi32>], [dense<1> : vector<1xi32>]] : !spirv.array<1 x !spirv.array<2 x vector<1xi32>>>
+ spirv.ReturnValue %cst : !spirv.array<1 x !spirv.array<2 x vector<1xi32>>>
+ }
+
+ spirv.func @splat_array_of_array_of_one_vector_of_one_i32() -> (!spirv.array<2 x !spirv.array<1 x vector<1xi32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1> : vector<1xi32>] : !spirv.array<2 x !spirv.array<1 x vector<1xi32>>>
+ %0 = spirv.Constant [[dense<1> : vector<1xi32>], [dense<1> : vector<1xi32>]] : !spirv.array<2 x !spirv.array<1 x vector<1xi32>>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<1 x vector<1xi32>>>
+ }
+
+ spirv.func @array_of_one_array_of_one_splat_vector_of_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xi32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>>
+ %0 = spirv.Constant [[dense<1> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>>
+ }
+
+ spirv.func @splat_array_of_splat_array_of_non_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>>
+ %0 = spirv.Constant [[[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]], [[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>>
+ }
+
+ spirv.func @splat_vector_of_f32() -> (vector<3xf32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : vector<3xf32>
+ %0 = spirv.Constant dense<2.0> : vector<3xf32>
+ spirv.ReturnValue %0 : vector<3xf32>
+ }
+
+ spirv.func @splat_array_of_f32() -> (!spirv.array<3 x f32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : !spirv.array<3 x f32>
+ %0 = spirv.Constant [1.0 : f32, 1.0 : f32, 1.0 : f32] : !spirv.array<3 x f32>
+ spirv.ReturnValue %0 : !spirv.array<3 x f32>
+ }
+
+ spirv.func @splat_array_of_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3.000000e+00 : f32] : !spirv.array<2 x !spirv.array<3 x f32>>
+ %0 = spirv.Constant [[3.0 : f32, 3.0 : f32, 3.0 : f32], [3.0 : f32, 3.0 : f32, 3.0 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>>
+ }
+
+ spirv.func @splat_array_of_non_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1.000000e+00 : f32, 2.000000e+00 : f32, 3.000000e+00 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>>
+ %0 = spirv.Constant [[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>>
+ }
+
+ spirv.func @splat_array_of_vectors_of_f32() -> (!spirv.array<2xvector<2xf32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1.000000e+00, 2.000000e+00]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>>
+ %0 = spirv.Constant [dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 2.0]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>>
+ }
+
+ spirv.func @splat_array_of_splat_vectors_of_f32() -> (!spirv.array<2 x vector<2xf32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.array<2 x vector<2xf32>>
+ %0 = spirv.Constant [dense<2.0> : vector<2xf32>, dense<2.0> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>>
+ }
+
+ spirv.func @splat_tensor_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3.000000e+00 : f32] : !spirv.array<2 x !spirv.array<3 x f32>>
+ %0 = spirv.Constant dense<3.0> : tensor<2x3xf32> : !spirv.array<2 x !spirv.array<3 x f32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>>
+ }
+
+ spirv.func @splat_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.arm.tensor<2x3xf32>
+ %0 = spirv.Constant dense<2.0> : !spirv.arm.tensor<2x3xf32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xf32>
+ }
+
+ spirv.func @array_of_splat_array_of_non_splat_vectors_of_f32() -> (!spirv.array<1 x !spirv.array<2 x vector<2xf32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1.000000e+00, 2.000000e+00]> : vector<2xf32>] : !spirv.array<1 x !spirv.array<2 x vector<2xf32>>
+ %0 = spirv.Constant [[dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 2.0]> : vector<2xf32>]] : !spirv.array<1 x !spirv.array<2 x vector<2xf32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<2 x vector<2xf32>>>
+ }
+
+ spirv.func @array_of_one_splat_array_of_vector_of_one_f32() -> !spirv.array<1 x !spirv.array<2 x vector<1xf32>>> "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1.000000e+00> : vector<1xf32>] : !spirv.array<1 x !spirv.array<2 x vector<1xf32>
+ %cst = spirv.Constant [[dense<1.0> : vector<1xf32>], [dense<1.0> : vector<1xf32>]] : !spirv.array<1 x !spirv.array<2 x vector<1xf32>>>
+ spirv.ReturnValue %cst : !spirv.array<1 x !spirv.array<2 x vector<1xf32>>>
+ }
+
+ spirv.func @splat_array_of_array_of_one_vector_of_one_f32() -> (!spirv.array<2 x !spirv.array<1 x vector<1xf32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1.000000e+00> : vector<1xf32>] : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>>
+ %0 = spirv.Constant [[dense<1.0> : vector<1xf32>], [dense<1.0> : vector<1xf32>]] : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>>
+ }
+
+ spirv.func @array_of_one_array_of_one_splat_vector_of_f32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xf32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>>
+ %0 = spirv.Constant [[dense<1.0> : vector<2xf32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>>
+ }
+
+ spirv.func @splat_array_of_splat_array_of_non_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>>) "None" {
+ // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1.000000e+00 : f32, 2.000000e+00 : f32, 3.000000e+00 : f32]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>>
+ %0 = spirv.Constant [[[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]], [[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>>
+ spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>>
+ }
+
+ spirv.func @array_of_one_i32() -> (!spirv.array<1 x i32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [1 : i32] : !spirv.array<1 x i32>
+ spirv.ReturnValue %0 : !spirv.array<1 x i32>
+ }
+
+ spirv.func @arm_tensor_of_one_i32() -> (!spirv.arm.tensor<1xi32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant dense<1> : !spirv.arm.tensor<1xi32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<1xi32>
+ }
+
+ spirv.func @non_splat_vector_of_i32() -> (vector<3xi32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant dense<[0, 1, 2]> : vector<3xi32>
+ spirv.ReturnValue %0 : vector<3xi32>
+ }
+
+ spirv.func @non_splat_array_of_vectors_of_i32() -> (!spirv.array<2xvector<2xi32>>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [dense<[1, 2]> : vector<2xi32>, dense<[1, 3]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>>
+ }
+
+ spirv.func @array_of_one_f32() -> (!spirv.array<1 x f32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [1.0 : f32] : !spirv.array<1 x f32>
+ spirv.ReturnValue %0 : !spirv.array<1 x f32>
+ }
+
+ spirv.func @arm_tensor_of_one_f32() -> (!spirv.arm.tensor<1xf32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant dense<1.0> : !spirv.arm.tensor<1xf32>
+ spirv.ReturnValue %0 : !spirv.arm.tensor<1xf32>
+ }
+
+ spirv.func @non_splat_vector_of_f32() -> (vector<3xf32>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant dense<[0.0, 1.0, 2.0]> : vector<3xf32>
+ spirv.ReturnValue %0 : vector<3xf32>
+ }
+
+ spirv.func @non_splat_array_of_vectors_of_f32() -> (!spirv.array<2xvector<2xf32>>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 3.0]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>>
+ spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>>
+ }
+
+ spirv.func @array_of_one_array_of_one_non_splat_vector_of_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xi32>>>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [[dense<[1, 2]> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>>
+ }
+
+ spirv.func @array_of_one_array_of_one_vector_of_one_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<1xi32>>>) "None" {
+ // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate
+ %0 = spirv.Constant [[dense<1> : vector<1xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<1xi32>>>
+ spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<1xi32>>>
+ }
+}
+
+// -----
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ReplicatedCompositesEXT], [SPV_EXT_replicated_composites]> {
+
+ spirv.SpecConstant @sc_i32_1 = 1 : i32
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_i32 (@sc_i32_1) : !spirv.array<3 x i32>
+ spirv.SpecConstantComposite @scc_splat_array_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.array<3 x i32>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_i32 (@sc_i32_1) : !spirv.struct<(i32, i32, i32)>
+ spirv.SpecConstantComposite @scc_splat_struct_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.struct<(i32, i32, i32)>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_i32 (@sc_i32_1) : vector<3xi32>
+ spirv.SpecConstantComposite @scc_splat_vector_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : vector<3 x i32>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_arm_tensor_of_i32 (@sc_i32_1) : !spirv.arm.tensor<3xi32>
+ spirv.SpecConstantComposite @scc_splat_arm_tensor_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.arm.tensor<3xi32>
+
+ spirv.SpecConstant @sc_f32_1 = 1.0 : f32
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_f32 (@sc_f32_1) : !spirv.array<3 x f32>
+ spirv.SpecConstantComposite @scc_splat_array_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.array<3 x f32>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_f32 (@sc_f32_1) : !spirv.struct<(f32, f32, f32)>
+ spirv.SpecConstantComposite @scc_splat_struct_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.struct<(f32, f32, f32)>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_f32 (@sc_f32_1) : vector<3xf32>
+ spirv.SpecConstantComposite @scc_splat_vector_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : vector<3 x f32>
+
+ // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_arm_tensor_of_f32 (@sc_f32_1) : !spirv.arm.tensor<3xf32>
+ spirv.SpecConstantComposite @scc_splat_arm_tensor_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.arm.tensor<3xf32>
+
+ spirv.SpecConstant @sc_i32_2 = 2 : i32
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_array_of_one_i32 (@sc_i32_1) : !spirv.array<1 x i32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_arm_tensor_of_one_i32 (@sc_i32_1) : !spirv.arm.tensor<1xi32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_non_splat_vector_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_2) : vector<3 x i32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_non_splat_arm_tensor_of_i32 (@sc_i32_2, @sc_i32_1, @sc_i32_1) : !spirv.arm.tensor<3xi32>
+
+ spirv.SpecConstant @sc_f32_2 = 2.0 : f32
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_array_of_one_f32 (@sc_f32_1) : !spirv.array<1 x f32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_arm_tensor_of_one_f32 (@sc_f32_1) : !spirv.arm.tensor<1xf32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_non_splat_vector_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_2) : vector<3 x f32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_non_splat_arm_tensor_of_f32 (@sc_f32_2, @sc_f32_1, @sc_f32_1) : !spirv.arm.tensor<3xf32>
+
+ // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate
+ spirv.SpecConstantComposite @scc_struct_of_i32_and_f32 (@sc_i32_1, @sc_i32_1, @sc_f32_1) : !spirv.struct<(i32, i32, f32)>
+}
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
index 7b0a849..cdbca72 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
@@ -451,6 +451,101 @@ module attributes {transform.with_named_sequence} {
// -----
+#map = affine_map<(d0) -> (-d0 + 4, 16)>
+func.func @fuse_pack_consumer_if_single_iteration(%arg0: tensor<4x4xf32>) -> tensor<1x4x16x1xf32> {
+ %0 = tensor.empty() : tensor<1x4x16x1xf32>
+ %1 = tensor.empty() : tensor<4x4xf32>
+ %2 = scf.forall (%arg1) = (0) to (4) step (16) shared_outs(%arg2 = %1) -> (tensor<4x4xf32>) {
+ %3 = affine.min #map(%arg1)
+ %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [%3, 4] [1, 1] : tensor<4x4xf32> to tensor<?x4xf32>
+ %extracted_slice_0 = tensor.extract_slice %arg2[%arg1, 0] [%3, 4] [1, 1] : tensor<4x4xf32> to tensor<?x4xf32>
+ %4 = linalg.exp ins(%extracted_slice : tensor<?x4xf32>) outs(%extracted_slice_0 : tensor<?x4xf32>) -> tensor<?x4xf32>
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %4 into %arg2[%arg1, 0] [%3, 4] [1, 1] : tensor<?x4xf32> into tensor<4x4xf32>
+ }
+ }
+ %cst = arith.constant 0.000000e+00 : f32
+ %pack = linalg.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %0 : tensor<4x4xf32> -> tensor<1x4x16x1xf32>
+ return %pack : tensor<1x4x16x1xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+// CHECK: #[[MAP:.*]] = affine_map<(d0) -> (-d0 + 4, 16)>
+// CHECK: func.func @fuse_pack_consumer_if_single_iteration(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-DAG: %[[PACK_INIT:.*]] = tensor.empty() : tensor<1x4x16x1xf32>
+// CHECK-DAG: %[[ELEM_INIT:.*]] = tensor.empty() : tensor<4x4xf32>
+// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (4) step (16)
+// CHECK-SAME: shared_outs(%[[ELEM_OUT_ARG:.*]] = %[[ELEM_INIT]], %[[PACK_OUT_ARG:.*]] = %[[PACK_INIT]])
+// CHECK-DAG: %[[SIZE:.+]] = affine.min #[[MAP]](%[[IV]])
+// CHECK-DAG: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1]
+// CHECK-DAG: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[ELEM_OUT_ARG]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1]
+// CHECK: %[[ELEM:.*]] = linalg.exp
+// CHECK-SAME: ins(%[[ELEM_SRC]]
+// CHECK-SAME: outs(%[[ELEM_DEST]]
+// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[IV]], 0, 0, 0] [1, 4, 16, 1] [1, 1, 1, 1]
+// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]]
+// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32)
+// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1]
+// CHECK-SAME: into %[[TILED_PACK_DEST]]
+// CHECK: scf.forall.in_parallel {
+// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[ELEM_OUT_ARG]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][%[[IV]], 0, 0, 0] [1, 4, 16, 1] [1, 1, 1, 1]
+
+// -----
+
+func.func @fuse_perfect_tiling_pack_consumer_with_outer_dims_perm(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>, %arg2: tensor<2x64x16x1xf32>) -> tensor<2x64x16x1xf32> {
+ %0 = scf.forall (%arg3) = (0) to (32) step (16) shared_outs(%arg4 = %arg1) -> (tensor<64x32xf32>) {
+ %src = tensor.extract_slice %arg0[0, %arg3] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32>
+ %dest = tensor.extract_slice %arg4[0, %arg3] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32>
+ %1 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32>
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %1 into %arg4[0, %arg3] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32>
+ }
+ }
+ %pack = linalg.pack %0 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %arg2 : tensor<64x32xf32> -> tensor<2x64x16x1xf32>
+ return %pack : tensor<2x64x16x1xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)>
+// CHECK: func.func @fuse_perfect_tiling_pack_consumer_with_outer_dims_perm(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]
+// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16)
+// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[ARG2]])
+// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1]
+// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1]
+// CHECK: %[[ELEM:.*]] = linalg.exp
+// CHECK-SAME: ins(%[[ELEM_SRC]]
+// CHECK-SAME: outs(%[[ELEM_DEST]]
+// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]])
+// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], 0, 0, 0] [1, 64, 16, 1] [1, 1, 1, 1]
+// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]]
+// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1]
+// CHECK-SAME: into %[[TILED_PACK_DEST]]
+// CHECK: scf.forall.in_parallel {
+// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], 0, 0, 0] [1, 64, 16, 1] [1, 1, 1, 1]
+
+// -----
+
// It is valid to fuse the pack op in perfect tiling scenario when the dimension
// is dynamic and padding is not needed.
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index f62cabe..307dc62 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -34,15 +34,6 @@
#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
-#ifdef BUILD_TIED_TASK_STACK
-#define TASK_STACK_EMPTY 0 // entries when the stack is empty
-#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK
-// Number of entries in each task stack array
-#define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS)
-// Mask for determining index into stack block
-#define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1)
-#endif // BUILD_TIED_TASK_STACK
-
#define TASK_NOT_PUSHED 1
#define TASK_SUCCESSFULLY_PUSHED 0
#define TASK_TIED 1
@@ -2704,23 +2695,6 @@ extern std::atomic<kmp_int32> __kmp_tdg_task_id;
extern kmp_int32 __kmp_num_tdg;
#endif
-#ifdef BUILD_TIED_TASK_STACK
-
-/* Tied Task stack definitions */
-typedef struct kmp_stack_block {
- kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE];
- struct kmp_stack_block *sb_next;
- struct kmp_stack_block *sb_prev;
-} kmp_stack_block_t;
-
-typedef struct kmp_task_stack {
- kmp_stack_block_t ts_first_block; // first block of stack entries
- kmp_taskdata_t **ts_top; // pointer to the top of stack
- kmp_int32 ts_entries; // number of entries on the stack
-} kmp_task_stack_t;
-
-#endif // BUILD_TIED_TASK_STACK
-
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Same fields as in the #else branch, but in reverse order */
@@ -2863,10 +2837,6 @@ typedef struct kmp_base_thread_data {
kmp_int32 td_deque_ntasks; // Number of tasks in deque
// GEH: shouldn't this be volatile since used in while-spin?
kmp_int32 td_deque_last_stolen; // Thread number of last successful steal
-#ifdef BUILD_TIED_TASK_STACK
- kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task
-// scheduling constraint
-#endif // BUILD_TIED_TASK_STACK
} kmp_base_thread_data_t;
#define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index d7bc492..e4d92a7 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -42,221 +42,6 @@ static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif
-#ifdef BUILD_TIED_TASK_STACK
-
-// __kmp_trace_task_stack: print the tied tasks from the task stack in order
-// from top do bottom
-//
-// gtid: global thread identifier for thread containing stack
-// thread_data: thread data for task team thread containing stack
-// threshold: value above which the trace statement triggers
-// location: string identifying call site of this function (for trace)
-static void __kmp_trace_task_stack(kmp_int32 gtid,
- kmp_thread_data_t *thread_data,
- int threshold, char *location) {
- kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
- kmp_taskdata_t **stack_top = task_stack->ts_top;
- kmp_int32 entries = task_stack->ts_entries;
- kmp_taskdata_t *tied_task;
-
- KA_TRACE(
- threshold,
- ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
- "first_block = %p, stack_top = %p \n",
- location, gtid, entries, task_stack->ts_first_block, stack_top));
-
- KMP_DEBUG_ASSERT(stack_top != NULL);
- KMP_DEBUG_ASSERT(entries > 0);
-
- while (entries != 0) {
- KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
- // fix up ts_top if we need to pop from previous block
- if (entries & TASK_STACK_INDEX_MASK == 0) {
- kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
-
- stack_block = stack_block->sb_prev;
- stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
- }
-
- // finish bookkeeping
- stack_top--;
- entries--;
-
- tied_task = *stack_top;
-
- KMP_DEBUG_ASSERT(tied_task != NULL);
- KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
-
- KA_TRACE(threshold,
- ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
- "stack_top=%p, tied_task=%p\n",
- location, gtid, entries, stack_top, tied_task));
- }
- KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
-
- KA_TRACE(threshold,
- ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
- location, gtid));
-}
-
-// __kmp_init_task_stack: initialize the task stack for the first time
-// after a thread_data structure is created.
-// It should not be necessary to do this again (assuming the stack works).
-//
-// gtid: global thread identifier of calling thread
-// thread_data: thread data for task team thread containing stack
-static void __kmp_init_task_stack(kmp_int32 gtid,
- kmp_thread_data_t *thread_data) {
- kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
- kmp_stack_block_t *first_block;
-
- // set up the first block of the stack
- first_block = &task_stack->ts_first_block;
- task_stack->ts_top = (kmp_taskdata_t **)first_block;
- memset((void *)first_block, '\0',
- TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
-
- // initialize the stack to be empty
- task_stack->ts_entries = TASK_STACK_EMPTY;
- first_block->sb_next = NULL;
- first_block->sb_prev = NULL;
-}
-
-// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
-//
-// gtid: global thread identifier for calling thread
-// thread_data: thread info for thread containing stack
-static void __kmp_free_task_stack(kmp_int32 gtid,
- kmp_thread_data_t *thread_data) {
- kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
- kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
-
- KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
- // free from the second block of the stack
- while (stack_block != NULL) {
- kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
-
- stack_block->sb_next = NULL;
- stack_block->sb_prev = NULL;
- if (stack_block != &task_stack->ts_first_block) {
- __kmp_thread_free(thread,
- stack_block); // free the block, if not the first
- }
- stack_block = next_block;
- }
- // initialize the stack to be empty
- task_stack->ts_entries = 0;
- task_stack->ts_top = NULL;
-}
-
-// __kmp_push_task_stack: Push the tied task onto the task stack.
-// Grow the stack if necessary by allocating another block.
-//
-// gtid: global thread identifier for calling thread
-// thread: thread info for thread containing stack
-// tied_task: the task to push on the stack
-static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
- kmp_taskdata_t *tied_task) {
- // GEH - need to consider what to do if tt_threads_data not allocated yet
- kmp_thread_data_t *thread_data =
- &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
- kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
-
- if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
- return; // Don't push anything on stack if team or team tasks are serialized
- }
-
- KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
- KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
-
- KA_TRACE(20,
- ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
- gtid, thread, tied_task));
- // Store entry
- *(task_stack->ts_top) = tied_task;
-
- // Do bookkeeping for next push
- task_stack->ts_top++;
- task_stack->ts_entries++;
-
- if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
- // Find beginning of this task block
- kmp_stack_block_t *stack_block =
- (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
-
- // Check if we already have a block
- if (stack_block->sb_next !=
- NULL) { // reset ts_top to beginning of next block
- task_stack->ts_top = &stack_block->sb_next->sb_block[0];
- } else { // Alloc new block and link it up
- kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
- thread, sizeof(kmp_stack_block_t));
-
- task_stack->ts_top = &new_block->sb_block[0];
- stack_block->sb_next = new_block;
- new_block->sb_prev = stack_block;
- new_block->sb_next = NULL;
-
- KA_TRACE(
- 30,
- ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
- gtid, tied_task, new_block));
- }
- }
- KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
- tied_task));
-}
-
-// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
-// the task, just check to make sure it matches the ending task passed in.
-//
-// gtid: global thread identifier for the calling thread
-// thread: thread info structure containing stack
-// tied_task: the task popped off the stack
-// ending_task: the task that is ending (should match popped task)
-static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
- kmp_taskdata_t *ending_task) {
- // GEH - need to consider what to do if tt_threads_data not allocated yet
- kmp_thread_data_t *thread_data =
- &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)];
- kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
- kmp_taskdata_t *tied_task;
-
- if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
- // Don't pop anything from stack if team or team tasks are serialized
- return;
- }
-
- KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
- KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
-
- KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
- thread));
-
- // fix up ts_top if we need to pop from previous block
- if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) {
- kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
-
- stack_block = stack_block->sb_prev;
- task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
- }
-
- // finish bookkeeping
- task_stack->ts_top--;
- task_stack->ts_entries--;
-
- tied_task = *(task_stack->ts_top);
-
- KMP_DEBUG_ASSERT(tied_task != NULL);
- KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
- KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
-
- KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
- tied_task));
- return;
-}
-#endif /* BUILD_TIED_TASK_STACK */
-
// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
@@ -683,13 +468,6 @@ static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
// KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
current_task->td_flags.executing = 0;
-// Add task to stack if tied
-#ifdef BUILD_TIED_TASK_STACK
- if (taskdata->td_flags.tiedness == TASK_TIED) {
- __kmp_push_task_stack(gtid, thread, taskdata);
- }
-#endif /* BUILD_TIED_TASK_STACK */
-
// mark starting task as executing and as current task
thread->th.th_current_task = taskdata;
@@ -1041,13 +819,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
is_taskgraph = taskdata->is_taskgraph;
#endif
-// Pop task from stack if tied
-#ifdef BUILD_TIED_TASK_STACK
- if (taskdata->td_flags.tiedness == TASK_TIED) {
- __kmp_pop_task_stack(gtid, thread, taskdata);
- }
-#endif /* BUILD_TIED_TASK_STACK */
-
if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
// untied task needs to check the counter so that the task structure is not
// freed prematurely
@@ -3786,13 +3557,6 @@ static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
thread_data->td.td_deque = NULL;
__kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
}
-
-#ifdef BUILD_TIED_TASK_STACK
- // GEH: Figure out what to do here for td_susp_tied_tasks
- if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
- __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
- }
-#endif // BUILD_TIED_TASK_STACK
}
// __kmp_realloc_task_threads_data:
@@ -3849,14 +3613,7 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
(void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
-#ifdef BUILD_TIED_TASK_STACK
- // GEH: Figure out if this is the right thing to do
- for (i = maxthreads; i < nthreads; i++) {
- kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
- __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
- }
-#endif // BUILD_TIED_TASK_STACK
- // Install the new data and free the old data
+ // Install the new data and free the old data
(*threads_data_p) = new_data;
__kmp_free(old_data);
} else {
@@ -3868,13 +3625,6 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
// kmp_reap_task_team( ).
*threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
nthreads * sizeof(kmp_thread_data_t));
-#ifdef BUILD_TIED_TASK_STACK
- // GEH: Figure out if this is the right thing to do
- for (i = 0; i < nthreads; i++) {
- kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
- __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
- }
-#endif // BUILD_TIED_TASK_STACK
}
task_team->tt.tt_max_threads = nthreads;
} else {
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e3d807a..f0b45a9 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2073,6 +2073,7 @@ libc_support_library(
":__support_fputil_fp_bits",
":__support_fputil_nearest_integer",
":__support_math_expf16_utils",
+ ":__support_math_exp10_float16_constants",
],
)
@@ -2276,6 +2277,38 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_exp10_float16_constants",
+ hdrs = ["src/__support/math/exp10_float16_constants.h"],
+ deps = [
+ ":__support_cpp_array",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_exp10f16_utils",
+ hdrs = ["src/__support/math/exp10f16_utils.h"],
+ deps = [
+ ":__support_math_exp10_float16_constants",
+ ":__support_math_expf16_utils",
+ ":__support_fputil_fp_bits",
+ ],
+)
+
+libc_support_library(
+ name = "__support_math_exp10f16",
+ hdrs = ["src/__support/math/exp10f16.h"],
+ deps = [
+ ":__support_math_exp10f16_utils",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_cast",
+ ":__support_fputil_rounding_mode",
+ ":__support_fputil_except_value_utils",
+ ":__support_macros_optimization",
+ ":__support_macros_properties_cpu_features",
+ ],
+)
+
############################### complex targets ################################
libc_function(
@@ -2896,14 +2929,15 @@ libc_math_function(
libc_math_function(
name = "exp10f16",
additional_deps = [
- ":expxf16",
+ ":__support_math_exp10f16",
+ ":errno",
],
)
libc_math_function(
name = "exp10m1f16",
additional_deps = [
- ":expxf16",
+ ":__support_math_exp10f16_utils",
],
)