diff options
155 files changed, 11484 insertions, 13275 deletions
diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index b445dcf..0f765e9 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -81,7 +81,7 @@ CodeAction toCodeAction(const ClangdServer::CodeActionResult::Rename &R, const URIForFile &File) { CodeAction CA; CA.title = R.FixMessage; - CA.kind = std::string(CodeAction::REFACTOR_KIND); + CA.kind = std::string(CodeAction::QUICKFIX_KIND); CA.command.emplace(); CA.command->title = R.FixMessage; CA.command->command = std::string(ApplyRenameCommand); diff --git a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp index 2c7f50d..95bf5e5 100644 --- a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp +++ b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp @@ -235,7 +235,8 @@ TEST_F(LSPTest, ClangTidyRename) { .takeValue() .getAsArray())[0]; - ASSERT_EQ((*RenameCommand.getAsObject())["title"], "change 'foo' to 'Foo'"); + ASSERT_EQ((*RenameCommand.getAsObject())["title"], + "Apply fix: change 'foo' to 'Foo'"); Client.expectServerCall("workspace/applyEdit"); Client.call("workspace/executeCommand", RenameCommand); diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index 9383856..8d52e13 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -494,7 +494,7 @@ inline StreamTy &printOpenACCClauseKind(StreamTy &Out, OpenACCClauseKind K) { case OpenACCClauseKind::Shortloop: llvm_unreachable("Shortloop shouldn't be generated in clang"); - LLVM_FALLTHROUGH; + [[fallthrough]]; case OpenACCClauseKind::Invalid: return Out << "<invalid>"; } diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h index 1ace535..bea5416 100644 --- a/clang/include/clang/ExtractAPI/API.h +++ b/clang/include/clang/ExtractAPI/API.h @@ 
-618,17 +618,17 @@ struct TagRecord : APIRecord, RecordContext { static bool classofKind(RecordKind K) { switch (K) { case RK_Enum: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_Struct: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_Union: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_CXXClass: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplate: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplateSpecialization: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplatePartialSpecialization: return true; default: @@ -704,15 +704,15 @@ struct RecordRecord : TagRecord { static bool classofKind(RecordKind K) { switch (K) { case RK_Struct: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_Union: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_CXXClass: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplate: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplateSpecialization: - LLVM_FALLTHROUGH; + [[fallthrough]]; case RK_ClassTemplatePartialSpecialization: return true; default: diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 87de9e6..d4de704 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -27,11 +27,11 @@ namespace targets { // getPointerWidthV(). 
static const char *const DataLayoutStringR600 = - "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; static const char *const DataLayoutStringAMDGCN = - "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" + "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-" "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-" "v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"; diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 82b71e3..2429a43 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -7795,7 +7795,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm: ExtractLow = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm: case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1, @@ -7803,7 +7803,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1"); case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm: ExtractLow = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm: case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2, @@ -7811,7 +7811,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2"); case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm: ExtractLow = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm: case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: return 
EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1, @@ -7819,7 +7819,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1"); case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm: ExtractLow = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm: case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2, @@ -7854,7 +7854,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm: case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm: case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane, @@ -7866,7 +7866,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm: case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm: case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, @@ -7898,37 +7898,37 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, "vmlall"); case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane, ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane"); case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane, ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane"); case 
NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane, ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane"); case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane, ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane"); case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane, ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane"); case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm: ExtendLaneArg = true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm: return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane, ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane"); diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 9b6b72b1..e50f06c 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -193,11 +193,11 @@ llvm::Type *SystemZABIInfo::getFPArgumentType(QualType Ty, case BuiltinType::Float16: if (Size == 16) return llvm::Type::getHalfTy(getVMContext()); - LLVM_FALLTHROUGH; + [[fallthrough]]; case BuiltinType::Float: if (Size == 32) return llvm::Type::getFloatTy(getVMContext()); - LLVM_FALLTHROUGH; + [[fallthrough]]; case BuiltinType::Double: return llvm::Type::getDoubleTy(getVMContext()); default: diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 2869549..5c8891f 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ 
-218,7 +218,6 @@ void getSpirvExtOperand(StringRef SpvExtensionArg, raw_ostream &out) { if (SpvExtensionArg.compare_insensitive("DXC") == 0) { bool first = true; - std::string Operand; for (StringRef E : DxcSupportedExtensions) { if (!first) out << ","; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 2879743..dec71191 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2569,6 +2569,12 @@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) { if (IsEnum && !Style.AllowShortEnumsOnASingleLine) addUnwrappedLine(); break; + case tok::kw_requires: { + auto *RequiresToken = FormatTok; + nextToken(); + parseRequiresExpression(RequiresToken); + break; + } default: nextToken(); break; diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp index 54abfa6..a55b7f5 100644 --- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp +++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp @@ -555,7 +555,7 @@ llvm::Expected<Expr *> Interpreter::convertExprToValue(Expr *E) { InterfaceKind Kind = V.computeInterfaceKind(DesugaredTy); switch (Kind) { case InterfaceKind::WithAlloc: - LLVM_FALLTHROUGH; + [[fallthrough]]; case InterfaceKind::CopyArray: { // __clang_Interpreter_SetValueWithAlloc. 
ExprResult AllocCall = diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 9abaf79..140b709 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2609,7 +2609,6 @@ public: bool IsRelatedToDecl, ASTContext &Ctx) override { SourceLocation Loc; - std::string Message; Loc = Node.get<Stmt>()->getBeginLoc(); S.Diag(Loc, diag::warn_unsafe_buffer_usage_unique_ptr_array_access) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 8aebf53..db14349 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -1265,7 +1265,7 @@ checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, return true; case NotGSLPointer: IsGslPtrValueFromGslTempOwner = false; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Report: break; } diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 9cbd1bd..7c44efd 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -614,8 +614,7 @@ ConstraintSatisfactionChecker::SubstitutionInTemplateArguments( for (unsigned I = 0, MappedIndex = 0; I < Used.size(); I++) { TemplateArgument Arg; if (Used[I]) - Arg = S.Context.getCanonicalTemplateArgument( - CTAI.SugaredConverted[MappedIndex++]); + Arg = CTAI.SugaredConverted[MappedIndex++]; if (I < SubstitutedOuterMost.size()) { SubstitutedOuterMost[I] = Arg; Offset = I + 1; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index af2dce8..5f1243a 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -149,7 +149,7 @@ namespace std_example { template<typename T> constexpr bool is_same_v<T, T> = true; template<typename T, typename U> concept same_as = is_same_v<T, U>; - 
// expected-note@-1 {{because 'is_same_v<int, typename std_example::T2::inner>' evaluated to false}} + // expected-note@-1 {{because 'is_same_v<int, typename T2::inner>' evaluated to false}} static_assert(C1<int>); static_assert(C1<int*>); @@ -160,7 +160,7 @@ namespace std_example { template<typename T> concept C2 = requires(T x) { {*x} -> same_as<typename T::inner>; - // expected-note@-1{{because 'same_as<int, typename std_example::T2::inner>' evaluated to false}} + // expected-note@-1{{because 'same_as<int, typename T2::inner>' evaluated to false}} // expected-note@-2{{because '*x' would be invalid: indirection requires pointer operand ('int' invalid)}} }; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 70a96be..9fc4906 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -27,7 +27,7 @@ using r4i = X<void>::r4<int>; // expected-error{{constraints not satisfied for c // C++ [expr.prim.req.nested] Examples namespace std_example { - template<typename U> concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} + template<typename U> concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(decltype(+t)) == 1' (4 == 1) evaluated to false}} template<typename T> concept D = requires (T t) { requires C1<decltype (+t)>; // expected-note{{because 'decltype(+t)' (aka 'int') does not satisfy 'C1'}} diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index c0406f8..4f192d3 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -95,8 +95,8 @@ concept OneOf = (is_same_v<T, Ts> || ...); // #OneOf // expected-note@#OneOf 3{{because 'is_same_v<int, char[1]>' evaluated to false}} // expected-note@#OneOf 3{{and 'is_same_v<int, 
char[2]>' evaluated to false}} // expected-note@#OneOf {{because 'is_same_v<decltype(nullptr), char>' evaluated to false}} -// expected-note@#OneOf {{because 'is_same_v<std::nullptr_t, char>' evaluated to false}} -// expected-note@#OneOf {{and 'is_same_v<std::nullptr_t, int>' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v<decltype(nullptr), char>' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v<decltype(nullptr), int>' evaluated to false}} // expected-note@#OneOf {{and 'is_same_v<decltype(nullptr), int>' evaluated to false}} template<OneOf<char[1], char[2]> T, OneOf<int, long, char> U> diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index eecee69..e950794 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -152,20 +152,20 @@ // RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=R600 -// R600: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" +// R600: target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" // RUN: %clang_cc1 -triple r600-unknown -target-cpu cayman -o - -emit-llvm %s \ // RUN: | FileCheck %s -check-prefix=R600D -// R600D: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" +// R600D: target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \ // RUN: | FileCheck %s -check-prefix=R600SI -// R600SI: target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" // Test default -target-cpu // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \ // RUN: | FileCheck %s -check-prefix=R600SIDefault -// R600SIDefault: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=AARCH64 diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl index 713ae48..72ce726 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl @@ -1,5 +1,5 @@ // RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s // RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s -// CHECK: target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" 
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" void foo(void) {} diff --git a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl index 999372c..3f0a37d 100644 --- a/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/Buffers.hlsl @@ -20,7 +20,7 @@ Buffer<double2> r4; // expected-error@+4 {{constraints not satisfied for class template 'Buffer'}} // expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class Buffer}} // expected-note@*:* {{because 'Buffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::Buffer<int>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(Buffer<int>)' evaluated to false}} Buffer<Buffer<int> > r5; struct s { @@ -66,7 +66,7 @@ Buffer<half[4]> r10; typedef vector<int, 8> int8; // expected-error@+3 {{constraints not satisfied for class template 'Buffer'}} // expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(int8)' evaluated to false}} Buffer<int8> r11; typedef int MyInt; @@ -91,7 +91,7 @@ Buffer<numbers> r15; // expected-error@+3 {{constraints not satisfied for class template 'Buffer'}} // expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}} -// 
expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(double3)' evaluated to false}} Buffer<double3> r16; diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl index b33f2af..aa36c48 100644 --- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl @@ -20,7 +20,7 @@ RWBuffer<double2> r4; // expected-error@+4 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{template declaration from hidden source: template <typename element_type> requires __is_typed_resource_element_compatible<element_type> class RWBuffer}} // expected-note@*:* {{because 'RWBuffer<int>' does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(hlsl::RWBuffer<int>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(RWBuffer<int>)' evaluated to false}} RWBuffer<RWBuffer<int> > r5; struct s { @@ -66,7 +66,7 @@ RWBuffer<half[4]> r10; typedef vector<int, 8> int8; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'int8' (aka 'vector<int, 8>') does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<int, 8>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(int8)' evaluated to false}} RWBuffer<int8> r11; typedef int MyInt; @@ -91,7 +91,7 @@ RWBuffer<numbers> r15; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'double3' (aka 'vector<double, 3>') does not satisfy '__is_typed_resource_element_compatible'}} -// 
expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector<double, 3>)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(double3)' evaluated to false}} RWBuffer<double3> r16; diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 73dce93..d36c6a8 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -68,8 +68,8 @@ struct my_range{ void baz() { auto it = begin(rng); // #BEGIN_CALL // expected-error-re@#INF_REQ {{satisfaction of constraint {{.*}} depends on itself}} -// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}} -// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}} +// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<struct my_range>' requested here}} +// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin<struct my_range>' required here}} // expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}} // expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}} // expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}} diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 3fbe7c0..ee2bb8d 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -833,13 +833,13 @@ struct Parent { static_assert(Parent<void>::TakesUnary<int, 0>::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesUnary'}} // expected-note@#UNARY{{because 'decltype(0ULL)' (aka 'unsigned long long') does not satisfy 'C'}} -// 
expected-note@#61777_C{{because 'sizeof(unsigned long long) == 4' (8 == 4) evaluated to false}} +// expected-note@#61777_C{{because 'sizeof(decltype(0ULL)) == 4' (8 == 4) evaluated to false}} static_assert(Parent<void>::TakesUnary<int, 0uLL>::i == 0); static_assert(Parent<int>::TakesBinary<int, 0>::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesBinary'}} // expected-note@#BINARY{{because 'C2<decltype(0ULL), int>' evaluated to false}} -// expected-note@#61777_C2{{because 'sizeof(unsigned long long) == sizeof(int)' (8 == 4) evaluated to false}} +// expected-note@#61777_C2{{because 'sizeof(decltype(0ULL)) == sizeof(int)' (8 == 4) evaluated to false}} static_assert(Parent<int>::TakesBinary<int, 0ULL>::i == 0); } @@ -1329,8 +1329,8 @@ static_assert(__cpp17_iterator<not_move_constructible>); \ // expected-error {{static assertion failed}} \ // expected-note {{because 'not_move_constructible' does not satisfy '__cpp17_iterator'}} \ // expected-note@#__cpp17_copy_constructible {{because 'not_move_constructible' does not satisfy '__cpp17_copy_constructible'}} \ -// expected-note@#__cpp17_move_constructible {{because 'parameter_mapping_regressions::case3::not_move_constructible' does not satisfy '__cpp17_move_constructible'}} \ -// expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<parameter_mapping_regressions::case3::not_move_constructible>' evaluated to false}} +// expected-note@#__cpp17_move_constructible {{because 'not_move_constructible' does not satisfy '__cpp17_move_constructible'}} \ +// expected-note@#is_move_constructible_v {{because 'is_move_constructible_v<not_move_constructible>' evaluated to false}} } namespace case4 { diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index 3bd4a6e..3508238 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -12,6 +12,7 
@@ add_clang_unittest(ClangAnalysisFlowSensitiveTests LoggerTest.cpp MapLatticeTest.cpp MatchSwitchTest.cpp + MockHeaders.cpp MultiVarConstantPropagationTest.cpp RecordOpsTest.cpp SignAnalysisTest.cpp diff --git a/clang/unittests/Analysis/FlowSensitive/MockHeaders.cpp b/clang/unittests/Analysis/FlowSensitive/MockHeaders.cpp new file mode 100644 index 0000000..c280921 --- /dev/null +++ b/clang/unittests/Analysis/FlowSensitive/MockHeaders.cpp @@ -0,0 +1,1259 @@ +//===--- MockHeaders.cpp - Mock headers for dataflow analyses -*- C++ ---*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines mock headers for testing of dataflow analyses. +// +//===----------------------------------------------------------------------===// + +#include "MockHeaders.h" + +namespace clang { +namespace dataflow { +namespace test { +static constexpr char CStdDefHeader[] = R"( +#ifndef CSTDDEF_H +#define CSTDDEF_H + +namespace std { + +typedef decltype(sizeof(char)) size_t; + +using nullptr_t = decltype(nullptr); + +} // namespace std + +#endif // CSTDDEF_H +)"; + +static constexpr char StdTypeTraitsHeader[] = R"( +#ifndef STD_TYPE_TRAITS_H +#define STD_TYPE_TRAITS_H + +#include "cstddef.h" + +namespace std { + +template <typename T, T V> +struct integral_constant { + static constexpr T value = V; +}; + +using true_type = integral_constant<bool, true>; +using false_type = integral_constant<bool, false>; + +template< class T > struct remove_reference {typedef T type;}; +template< class T > struct remove_reference<T&> {typedef T type;}; +template< class T > struct remove_reference<T&&> {typedef T type;}; + +template <class T> + using remove_reference_t = typename remove_reference<T>::type; + +template <class T> +struct 
remove_extent { + typedef T type; +}; + +template <class T> +struct remove_extent<T[]> { + typedef T type; +}; + +template <class T, size_t N> +struct remove_extent<T[N]> { + typedef T type; +}; + +template <class T> +struct is_array : false_type {}; + +template <class T> +struct is_array<T[]> : true_type {}; + +template <class T, size_t N> +struct is_array<T[N]> : true_type {}; + +template <class> +struct is_function : false_type {}; + +template <class Ret, class... Args> +struct is_function<Ret(Args...)> : true_type {}; + +namespace detail { + +template <class T> +struct type_identity { + using type = T; +}; // or use type_identity (since C++20) + +template <class T> +auto try_add_pointer(int) -> type_identity<typename remove_reference<T>::type*>; +template <class T> +auto try_add_pointer(...) -> type_identity<T>; + +} // namespace detail + +template <class T> +struct add_pointer : decltype(detail::try_add_pointer<T>(0)) {}; + +template <bool B, class T, class F> +struct conditional { + typedef T type; +}; + +template <class T, class F> +struct conditional<false, T, F> { + typedef F type; +}; + +template <class T> +struct remove_cv { + typedef T type; +}; +template <class T> +struct remove_cv<const T> { + typedef T type; +}; +template <class T> +struct remove_cv<volatile T> { + typedef T type; +}; +template <class T> +struct remove_cv<const volatile T> { + typedef T type; +}; + +template <class T> +using remove_cv_t = typename remove_cv<T>::type; + +template <class T> +struct decay { + private: + typedef typename remove_reference<T>::type U; + + public: + typedef typename conditional< + is_array<U>::value, typename remove_extent<U>::type*, + typename conditional<is_function<U>::value, typename add_pointer<U>::type, + typename remove_cv<U>::type>::type>::type type; +}; + +template <bool B, class T = void> +struct enable_if {}; + +template <class T> +struct enable_if<true, T> { + typedef T type; +}; + +template <bool B, class T = void> +using enable_if_t = typename 
enable_if<B, T>::type; + +template <class T, class U> +struct is_same : false_type {}; + +template <class T> +struct is_same<T, T> : true_type {}; + +template <class T> +struct is_void : is_same<void, typename remove_cv<T>::type> {}; + +namespace detail { + +template <class T> +auto try_add_lvalue_reference(int) -> type_identity<T&>; +template <class T> +auto try_add_lvalue_reference(...) -> type_identity<T>; + +template <class T> +auto try_add_rvalue_reference(int) -> type_identity<T&&>; +template <class T> +auto try_add_rvalue_reference(...) -> type_identity<T>; + +} // namespace detail + +template <class T> +struct add_lvalue_reference : decltype(detail::try_add_lvalue_reference<T>(0)) { +}; + +template <class T> +struct add_rvalue_reference : decltype(detail::try_add_rvalue_reference<T>(0)) { +}; + +template <class T> +typename add_rvalue_reference<T>::type declval() noexcept; + +namespace detail { + +template <class T> +auto test_returnable(int) + -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{}); +template <class> +auto test_returnable(...) -> false_type; + +template <class From, class To> +auto test_implicitly_convertible(int) + -> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{}); +template <class, class> +auto test_implicitly_convertible(...) -> false_type; + +} // namespace detail + +template <class From, class To> +struct is_convertible + : integral_constant<bool, + (decltype(detail::test_returnable<To>(0))::value && + decltype(detail::test_implicitly_convertible<From, To>( + 0))::value) || + (is_void<From>::value && is_void<To>::value)> {}; + +template <class From, class To> +inline constexpr bool is_convertible_v = is_convertible<From, To>::value; + +template <class...> +using void_t = void; + +template <class, class T, class... Args> +struct is_constructible_ : false_type {}; + +template <class T, class... 
Args> +struct is_constructible_<void_t<decltype(T(declval<Args>()...))>, T, Args...> + : true_type {}; + +template <class T, class... Args> +using is_constructible = is_constructible_<void_t<>, T, Args...>; + +template <class T, class... Args> +inline constexpr bool is_constructible_v = is_constructible<T, Args...>::value; + +template <class _Tp> +struct __uncvref { + typedef typename remove_cv<typename remove_reference<_Tp>::type>::type type; +}; + +template <class _Tp> +using __uncvref_t = typename __uncvref<_Tp>::type; + +template <bool _Val> +using _BoolConstant = integral_constant<bool, _Val>; + +template <class _Tp, class _Up> +using _IsSame = _BoolConstant<__is_same(_Tp, _Up)>; + +template <class _Tp, class _Up> +using _IsNotSame = _BoolConstant<!__is_same(_Tp, _Up)>; + +template <bool> +struct _MetaBase; +template <> +struct _MetaBase<true> { + template <class _Tp, class _Up> + using _SelectImpl = _Tp; + template <template <class...> class _FirstFn, template <class...> class, + class... _Args> + using _SelectApplyImpl = _FirstFn<_Args...>; + template <class _First, class...> + using _FirstImpl = _First; + template <class, class _Second, class...> + using _SecondImpl = _Second; + template <class _Result, class _First, class... _Rest> + using _OrImpl = + typename _MetaBase<_First::value != true && sizeof...(_Rest) != 0>:: + template _OrImpl<_First, _Rest...>; +}; + +template <> +struct _MetaBase<false> { + template <class _Tp, class _Up> + using _SelectImpl = _Up; + template <template <class...> class, template <class...> class _SecondFn, + class... _Args> + using _SelectApplyImpl = _SecondFn<_Args...>; + template <class _Result, class...> + using _OrImpl = _Result; +}; + +template <bool _Cond, class _IfRes, class _ElseRes> +using _If = typename _MetaBase<_Cond>::template _SelectImpl<_IfRes, _ElseRes>; + +template <class... 
_Rest> +using _Or = typename _MetaBase<sizeof...(_Rest) != + 0>::template _OrImpl<false_type, _Rest...>; + +template <bool _Bp, class _Tp = void> +using __enable_if_t = typename enable_if<_Bp, _Tp>::type; + +template <class...> +using __expand_to_true = true_type; +template <class... _Pred> +__expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int); +template <class...> +false_type __and_helper(...); +template <class... _Pred> +using _And = decltype(__and_helper<_Pred...>(0)); + +template <class _Pred> +struct _Not : _BoolConstant<!_Pred::value> {}; + +struct __check_tuple_constructor_fail { + static constexpr bool __enable_explicit_default() { return false; } + static constexpr bool __enable_implicit_default() { return false; } + template <class...> + static constexpr bool __enable_explicit() { + return false; + } + template <class...> + static constexpr bool __enable_implicit() { + return false; + } +}; + +template <typename, typename _Tp> +struct __select_2nd { + typedef _Tp type; +}; +template <class _Tp, class _Arg> +typename __select_2nd<decltype((declval<_Tp>() = declval<_Arg>())), + true_type>::type +__is_assignable_test(int); +template <class, class> +false_type __is_assignable_test(...); +template <class _Tp, class _Arg, + bool = is_void<_Tp>::value || is_void<_Arg>::value> +struct __is_assignable_imp + : public decltype((__is_assignable_test<_Tp, _Arg>(0))) {}; +template <class _Tp, class _Arg> +struct __is_assignable_imp<_Tp, _Arg, true> : public false_type {}; +template <class _Tp, class _Arg> +struct is_assignable : public __is_assignable_imp<_Tp, _Arg> {}; + +template <class _Tp> +struct __libcpp_is_integral : public false_type {}; +template <> +struct __libcpp_is_integral<bool> : public true_type {}; +template <> +struct __libcpp_is_integral<char> : public true_type {}; +template <> +struct __libcpp_is_integral<signed char> : public true_type {}; +template <> +struct __libcpp_is_integral<unsigned char> : public true_type {}; +template <> 
+struct __libcpp_is_integral<wchar_t> : public true_type {}; +template <> +struct __libcpp_is_integral<short> : public true_type {}; // NOLINT +template <> +struct __libcpp_is_integral<unsigned short> : public true_type {}; // NOLINT +template <> +struct __libcpp_is_integral<int> : public true_type {}; +template <> +struct __libcpp_is_integral<unsigned int> : public true_type {}; +template <> +struct __libcpp_is_integral<long> : public true_type {}; // NOLINT +template <> +struct __libcpp_is_integral<unsigned long> : public true_type {}; // NOLINT +template <> +struct __libcpp_is_integral<long long> : public true_type {}; // NOLINT +template <> // NOLINTNEXTLINE +struct __libcpp_is_integral<unsigned long long> : public true_type {}; +template <class _Tp> +struct is_integral + : public __libcpp_is_integral<typename remove_cv<_Tp>::type> {}; + +template <class _Tp> +struct __libcpp_is_floating_point : public false_type {}; +template <> +struct __libcpp_is_floating_point<float> : public true_type {}; +template <> +struct __libcpp_is_floating_point<double> : public true_type {}; +template <> +struct __libcpp_is_floating_point<long double> : public true_type {}; +template <class _Tp> +struct is_floating_point + : public __libcpp_is_floating_point<typename remove_cv<_Tp>::type> {}; + +template <class _Tp> +struct is_arithmetic + : public integral_constant<bool, is_integral<_Tp>::value || + is_floating_point<_Tp>::value> {}; + +template <class _Tp> +struct __libcpp_is_pointer : public false_type {}; +template <class _Tp> +struct __libcpp_is_pointer<_Tp*> : public true_type {}; +template <class _Tp> +struct is_pointer : public __libcpp_is_pointer<typename remove_cv<_Tp>::type> { +}; + +template <class _Tp> +struct __libcpp_is_member_pointer : public false_type {}; +template <class _Tp, class _Up> +struct __libcpp_is_member_pointer<_Tp _Up::*> : public true_type {}; +template <class _Tp> +struct is_member_pointer + : public __libcpp_is_member_pointer<typename 
remove_cv<_Tp>::type> {}; + +template <class _Tp> +struct __libcpp_union : public false_type {}; +template <class _Tp> +struct is_union : public __libcpp_union<typename remove_cv<_Tp>::type> {}; + +template <class T> +struct is_reference : false_type {}; +template <class T> +struct is_reference<T&> : true_type {}; +template <class T> +struct is_reference<T&&> : true_type {}; + +template <class T> +inline constexpr bool is_reference_v = is_reference<T>::value; + +struct __two { + char __lx[2]; +}; + +namespace __is_class_imp { +template <class _Tp> +char __test(int _Tp::*); +template <class _Tp> +__two __test(...); +} // namespace __is_class_imp +template <class _Tp> +struct is_class + : public integral_constant<bool, + sizeof(__is_class_imp::__test<_Tp>(0)) == 1 && + !is_union<_Tp>::value> {}; + +template <class _Tp> +struct __is_nullptr_t_impl : public false_type {}; +template <> +struct __is_nullptr_t_impl<nullptr_t> : public true_type {}; +template <class _Tp> +struct __is_nullptr_t + : public __is_nullptr_t_impl<typename remove_cv<_Tp>::type> {}; +template <class _Tp> +struct is_null_pointer + : public __is_nullptr_t_impl<typename remove_cv<_Tp>::type> {}; + +template <class _Tp> +struct is_enum + : public integral_constant< + bool, !is_void<_Tp>::value && !is_integral<_Tp>::value && + !is_floating_point<_Tp>::value && !is_array<_Tp>::value && + !is_pointer<_Tp>::value && !is_reference<_Tp>::value && + !is_member_pointer<_Tp>::value && !is_union<_Tp>::value && + !is_class<_Tp>::value && !is_function<_Tp>::value> {}; + +template <class _Tp> +struct is_scalar + : public integral_constant< + bool, is_arithmetic<_Tp>::value || is_member_pointer<_Tp>::value || + is_pointer<_Tp>::value || __is_nullptr_t<_Tp>::value || + is_enum<_Tp>::value> {}; +template <> +struct is_scalar<nullptr_t> : public true_type {}; + +} // namespace std + +#endif // STD_TYPE_TRAITS_H +)"; + +static constexpr char AbslTypeTraitsHeader[] = R"( +#ifndef ABSL_TYPE_TRAITS_H +#define 
ABSL_TYPE_TRAITS_H + +#include "std_type_traits.h" + +namespace absl { + +template <typename... Ts> +struct conjunction : std::true_type {}; + +template <typename T, typename... Ts> +struct conjunction<T, Ts...> + : std::conditional<T::value, conjunction<Ts...>, T>::type {}; + +template <typename T> +struct conjunction<T> : T {}; + +template <typename T> +struct negation : std::integral_constant<bool, !T::value> {}; + +template <bool B, typename T = void> +using enable_if_t = typename std::enable_if<B, T>::type; + +} // namespace absl + +#endif // ABSL_TYPE_TRAITS_H +)"; + +static constexpr char StdStringHeader[] = R"( +#ifndef STRING_H +#define STRING_H + +namespace std { + +struct string { + string(const char*); + ~string(); + bool empty(); +}; +bool operator!=(const string &LHS, const char *RHS); + +} // namespace std + +#endif // STRING_H +)"; + +static constexpr char StdUtilityHeader[] = R"( +#ifndef UTILITY_H +#define UTILITY_H + +#include "std_type_traits.h" + +namespace std { + +template <typename T> +constexpr remove_reference_t<T>&& move(T&& x); + +template <typename T> +void swap(T& a, T& b) noexcept; + +} // namespace std + +#endif // UTILITY_H +)"; + +static constexpr char StdInitializerListHeader[] = R"( +#ifndef INITIALIZER_LIST_H +#define INITIALIZER_LIST_H + +namespace std { + +template <typename T> +class initializer_list { + public: + const T *a, *b; + initializer_list() noexcept; +}; + +} // namespace std + +#endif // INITIALIZER_LIST_H +)"; + +static constexpr char StdOptionalHeader[] = R"( +#include "std_initializer_list.h" +#include "std_type_traits.h" +#include "std_utility.h" + +namespace std { + +struct in_place_t {}; +constexpr in_place_t in_place; + +struct nullopt_t { + constexpr explicit nullopt_t() {} +}; +constexpr nullopt_t nullopt; + +template <class _Tp> +struct __optional_destruct_base { + constexpr void reset() noexcept; +}; + +template <class _Tp> +struct __optional_storage_base : __optional_destruct_base<_Tp> { + constexpr 
bool has_value() const noexcept; +}; + +template <typename _Tp> +class optional : private __optional_storage_base<_Tp> { + using __base = __optional_storage_base<_Tp>; + + public: + using value_type = _Tp; + + private: + struct _CheckOptionalArgsConstructor { + template <class _Up> + static constexpr bool __enable_implicit() { + return is_constructible_v<_Tp, _Up&&> && is_convertible_v<_Up&&, _Tp>; + } + + template <class _Up> + static constexpr bool __enable_explicit() { + return is_constructible_v<_Tp, _Up&&> && !is_convertible_v<_Up&&, _Tp>; + } + }; + template <class _Up> + using _CheckOptionalArgsCtor = + _If<_IsNotSame<__uncvref_t<_Up>, in_place_t>::value && + _IsNotSame<__uncvref_t<_Up>, optional>::value, + _CheckOptionalArgsConstructor, __check_tuple_constructor_fail>; + template <class _QualUp> + struct _CheckOptionalLikeConstructor { + template <class _Up, class _Opt = optional<_Up>> + using __check_constructible_from_opt = + _Or<is_constructible<_Tp, _Opt&>, is_constructible<_Tp, _Opt const&>, + is_constructible<_Tp, _Opt&&>, is_constructible<_Tp, _Opt const&&>, + is_convertible<_Opt&, _Tp>, is_convertible<_Opt const&, _Tp>, + is_convertible<_Opt&&, _Tp>, is_convertible<_Opt const&&, _Tp>>; + template <class _Up, class _QUp = _QualUp> + static constexpr bool __enable_implicit() { + return is_convertible<_QUp, _Tp>::value && + !__check_constructible_from_opt<_Up>::value; + } + template <class _Up, class _QUp = _QualUp> + static constexpr bool __enable_explicit() { + return !is_convertible<_QUp, _Tp>::value && + !__check_constructible_from_opt<_Up>::value; + } + }; + + template <class _Up, class _QualUp> + using _CheckOptionalLikeCtor = + _If<_And<_IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp>>::value, + _CheckOptionalLikeConstructor<_QualUp>, + __check_tuple_constructor_fail>; + + + template <class _Up, class _QualUp> + using _CheckOptionalLikeAssign = _If< + _And< + _IsNotSame<_Up, _Tp>, + is_constructible<_Tp, _QualUp>, + is_assignable<_Tp&, 
_QualUp> + >::value, + _CheckOptionalLikeConstructor<_QualUp>, + __check_tuple_constructor_fail + >; + + public: + constexpr optional() noexcept {} + constexpr optional(const optional&) = default; + constexpr optional(optional&&) = default; + constexpr optional(nullopt_t) noexcept {} + + template < + class _InPlaceT, class... _Args, + class = enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, + is_constructible<value_type, _Args...>>::value>> + constexpr explicit optional(_InPlaceT, _Args&&... __args); + + template <class _Up, class... _Args, + class = enable_if_t<is_constructible_v< + value_type, initializer_list<_Up>&, _Args...>>> + constexpr explicit optional(in_place_t, initializer_list<_Up> __il, + _Args&&... __args); + + template < + class _Up = value_type, + enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), + int> = 0> + constexpr optional(_Up&& __v); + + template < + class _Up, + enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), + int> = 0> + constexpr explicit optional(_Up&& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>:: + template __enable_implicit<_Up>(), + int> = 0> + constexpr optional(const optional<_Up>& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>:: + template __enable_explicit<_Up>(), + int> = 0> + constexpr explicit optional(const optional<_Up>& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: + template __enable_implicit<_Up>(), + int> = 0> + constexpr optional(optional<_Up>&& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: + template __enable_explicit<_Up>(), + int> = 0> + constexpr explicit optional(optional<_Up>&& __v); + + constexpr optional& operator=(nullopt_t) noexcept; + + optional& operator=(const optional&); + + optional& operator=(optional&&); + + template <class _Up = value_type, + class = enable_if_t<_And<_IsNotSame<__uncvref_t<_Up>, optional>, + 
_Or<_IsNotSame<__uncvref_t<_Up>, value_type>, + _Not<is_scalar<value_type>>>, + is_constructible<value_type, _Up>, + is_assignable<value_type&, _Up>>::value>> + constexpr optional& operator=(_Up&& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeAssign<_Up, _Up const&>:: + template __enable_assign<_Up>(), + int> = 0> + constexpr optional& operator=(const optional<_Up>& __v); + + template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: + template __enable_assign<_Up>(), + int> = 0> + constexpr optional& operator=(optional<_Up>&& __v); + + const _Tp& operator*() const&; + _Tp& operator*() &; + const _Tp&& operator*() const&&; + _Tp&& operator*() &&; + + const _Tp* operator->() const; + _Tp* operator->(); + + const _Tp& value() const&; + _Tp& value() &; + const _Tp&& value() const&&; + _Tp&& value() &&; + + template <typename U> + constexpr _Tp value_or(U&& v) const&; + template <typename U> + _Tp value_or(U&& v) &&; + + template <typename... Args> + _Tp& emplace(Args&&... args); + + template <typename U, typename... Args> + _Tp& emplace(std::initializer_list<U> ilist, Args&&... args); + + using __base::reset; + + constexpr explicit operator bool() const noexcept; + using __base::has_value; + + constexpr void swap(optional& __opt) noexcept; +}; + +template <typename T> +constexpr optional<typename std::decay<T>::type> make_optional(T&& v); + +template <typename T, typename... Args> +constexpr optional<T> make_optional(Args&&... args); + +template <typename T, typename U, typename... Args> +constexpr optional<T> make_optional(std::initializer_list<U> il, + Args&&... 
args); + +template <typename T, typename U> +constexpr bool operator==(const optional<T> &lhs, const optional<U> &rhs); +template <typename T, typename U> +constexpr bool operator!=(const optional<T> &lhs, const optional<U> &rhs); + +template <typename T> +constexpr bool operator==(const optional<T> &opt, nullopt_t); + +// C++20 and later do not define the following overloads because they are +// provided by rewritten candidates instead. +#if __cplusplus < 202002L +template <typename T> +constexpr bool operator==(nullopt_t, const optional<T> &opt); +template <typename T> +constexpr bool operator!=(const optional<T> &opt, nullopt_t); +template <typename T> +constexpr bool operator!=(nullopt_t, const optional<T> &opt); +#endif // __cplusplus < 202002L + +template <typename T, typename U> +constexpr bool operator==(const optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator==(const T &value, const optional<U> &opt); +template <typename T, typename U> +constexpr bool operator!=(const optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator!=(const T &value, const optional<U> &opt); + +} // namespace std +)"; + +static constexpr char AbslOptionalHeader[] = R"( +#include "absl_type_traits.h" +#include "std_initializer_list.h" +#include "std_type_traits.h" +#include "std_utility.h" + +namespace absl { + +struct nullopt_t { + constexpr explicit nullopt_t() {} +}; +constexpr nullopt_t nullopt; + +struct in_place_t {}; +constexpr in_place_t in_place; + +template <typename T> +class optional; + +namespace optional_internal { + +template <typename T, typename U> +struct is_constructible_convertible_from_optional + : std::integral_constant< + bool, std::is_constructible<T, optional<U>&>::value || + std::is_constructible<T, optional<U>&&>::value || + std::is_constructible<T, const optional<U>&>::value || + std::is_constructible<T, const optional<U>&&>::value || + std::is_convertible<optional<U>&, 
T>::value || + std::is_convertible<optional<U>&&, T>::value || + std::is_convertible<const optional<U>&, T>::value || + std::is_convertible<const optional<U>&&, T>::value> {}; + +template <typename T, typename U> +struct is_constructible_convertible_assignable_from_optional + : std::integral_constant< + bool, is_constructible_convertible_from_optional<T, U>::value || + std::is_assignable<T&, optional<U>&>::value || + std::is_assignable<T&, optional<U>&&>::value || + std::is_assignable<T&, const optional<U>&>::value || + std::is_assignable<T&, const optional<U>&&>::value> {}; + +} // namespace optional_internal + +template <typename T> +class optional { + public: + constexpr optional() noexcept; + + constexpr optional(nullopt_t) noexcept; + + optional(const optional&) = default; + + optional(optional&&) = default; + + template <typename InPlaceT, typename... Args, + absl::enable_if_t<absl::conjunction< + std::is_same<InPlaceT, in_place_t>, + std::is_constructible<T, Args&&...>>::value>* = nullptr> + constexpr explicit optional(InPlaceT, Args&&... args); + + template <typename U, typename... Args, + typename = typename std::enable_if<std::is_constructible< + T, std::initializer_list<U>&, Args&&...>::value>::type> + constexpr explicit optional(in_place_t, std::initializer_list<U> il, + Args&&... 
args); + + template < + typename U = T, + typename std::enable_if< + absl::conjunction<absl::negation<std::is_same< + in_place_t, typename std::decay<U>::type>>, + absl::negation<std::is_same< + optional<T>, typename std::decay<U>::type>>, + std::is_convertible<U&&, T>, + std::is_constructible<T, U&&>>::value, + bool>::type = false> + constexpr optional(U&& v); + + template < + typename U = T, + typename std::enable_if< + absl::conjunction<absl::negation<std::is_same< + in_place_t, typename std::decay<U>::type>>, + absl::negation<std::is_same< + optional<T>, typename std::decay<U>::type>>, + absl::negation<std::is_convertible<U&&, T>>, + std::is_constructible<T, U&&>>::value, + bool>::type = false> + explicit constexpr optional(U&& v); + + template <typename U, + typename std::enable_if< + absl::conjunction< + absl::negation<std::is_same<T, U>>, + std::is_constructible<T, const U&>, + absl::negation< + optional_internal:: + is_constructible_convertible_from_optional<T, U>>, + std::is_convertible<const U&, T>>::value, + bool>::type = false> + optional(const optional<U>& rhs); + + template <typename U, + typename std::enable_if< + absl::conjunction< + absl::negation<std::is_same<T, U>>, + std::is_constructible<T, const U&>, + absl::negation< + optional_internal:: + is_constructible_convertible_from_optional<T, U>>, + absl::negation<std::is_convertible<const U&, T>>>::value, + bool>::type = false> + explicit optional(const optional<U>& rhs); + + template < + typename U, + typename std::enable_if< + absl::conjunction< + absl::negation<std::is_same<T, U>>, std::is_constructible<T, U&&>, + absl::negation< + optional_internal::is_constructible_convertible_from_optional< + T, U>>, + std::is_convertible<U&&, T>>::value, + bool>::type = false> + optional(optional<U>&& rhs); + + template < + typename U, + typename std::enable_if< + absl::conjunction< + absl::negation<std::is_same<T, U>>, std::is_constructible<T, U&&>, + absl::negation< + 
optional_internal::is_constructible_convertible_from_optional< + T, U>>, + absl::negation<std::is_convertible<U&&, T>>>::value, + bool>::type = false> + explicit optional(optional<U>&& rhs); + + optional& operator=(nullopt_t) noexcept; + + optional& operator=(const optional& src); + + optional& operator=(optional&& src); + + template < + typename U = T, + typename = typename std::enable_if<absl::conjunction< + absl::negation< + std::is_same<optional<T>, typename std::decay<U>::type>>, + absl::negation< + absl::conjunction<std::is_scalar<T>, + std::is_same<T, typename std::decay<U>::type>>>, + std::is_constructible<T, U>, std::is_assignable<T&, U>>::value>::type> + optional& operator=(U&& v); + + template < + typename U, + typename = typename std::enable_if<absl::conjunction< + absl::negation<std::is_same<T, U>>, + std::is_constructible<T, const U&>, std::is_assignable<T&, const U&>, + absl::negation< + optional_internal:: + is_constructible_convertible_assignable_from_optional< + T, U>>>::value>::type> + optional& operator=(const optional<U>& rhs); + + template <typename U, + typename = typename std::enable_if<absl::conjunction< + absl::negation<std::is_same<T, U>>, std::is_constructible<T, U>, + std::is_assignable<T&, U>, + absl::negation< + optional_internal:: + is_constructible_convertible_assignable_from_optional< + T, U>>>::value>::type> + optional& operator=(optional<U>&& rhs); + + const T& operator*() const&; + T& operator*() &; + const T&& operator*() const&&; + T&& operator*() &&; + + const T* operator->() const; + T* operator->(); + + const T& value() const&; + T& value() &; + const T&& value() const&&; + T&& value() &&; + + template <typename U> + constexpr T value_or(U&& v) const&; + template <typename U> + T value_or(U&& v) &&; + + template <typename... Args> + T& emplace(Args&&... args); + + template <typename U, typename... Args> + T& emplace(std::initializer_list<U> ilist, Args&&... 
args); + + void reset() noexcept; + + constexpr explicit operator bool() const noexcept; + constexpr bool has_value() const noexcept; + + void swap(optional& rhs) noexcept; +}; + +template <typename T> +constexpr optional<typename std::decay<T>::type> make_optional(T&& v); + +template <typename T, typename... Args> +constexpr optional<T> make_optional(Args&&... args); + +template <typename T, typename U, typename... Args> +constexpr optional<T> make_optional(std::initializer_list<U> il, + Args&&... args); + +template <typename T, typename U> +constexpr bool operator==(const optional<T> &lhs, const optional<U> &rhs); +template <typename T, typename U> +constexpr bool operator!=(const optional<T> &lhs, const optional<U> &rhs); + +template <typename T> +constexpr bool operator==(const optional<T> &opt, nullopt_t); +template <typename T> +constexpr bool operator==(nullopt_t, const optional<T> &opt); +template <typename T> +constexpr bool operator!=(const optional<T> &opt, nullopt_t); +template <typename T> +constexpr bool operator!=(nullopt_t, const optional<T> &opt); + +template <typename T, typename U> +constexpr bool operator==(const optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator==(const T &value, const optional<U> &opt); +template <typename T, typename U> +constexpr bool operator!=(const optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator!=(const T &value, const optional<U> &opt); + +} // namespace absl +)"; + +static constexpr char BaseOptionalHeader[] = R"( +#include "std_initializer_list.h" +#include "std_type_traits.h" +#include "std_utility.h" + +namespace base { + +struct in_place_t {}; +constexpr in_place_t in_place; + +struct nullopt_t { + constexpr explicit nullopt_t() {} +}; +constexpr nullopt_t nullopt; + +template <typename T> +class Optional; + +namespace internal { + +template <typename T> +using RemoveCvRefT = std::remove_cv_t<std::remove_reference_t<T>>; + 
+template <typename T, typename U> +struct IsConvertibleFromOptional + : std::integral_constant< + bool, std::is_constructible<T, Optional<U>&>::value || + std::is_constructible<T, const Optional<U>&>::value || + std::is_constructible<T, Optional<U>&&>::value || + std::is_constructible<T, const Optional<U>&&>::value || + std::is_convertible<Optional<U>&, T>::value || + std::is_convertible<const Optional<U>&, T>::value || + std::is_convertible<Optional<U>&&, T>::value || + std::is_convertible<const Optional<U>&&, T>::value> {}; + +template <typename T, typename U> +struct IsAssignableFromOptional + : std::integral_constant< + bool, IsConvertibleFromOptional<T, U>::value || + std::is_assignable<T&, Optional<U>&>::value || + std::is_assignable<T&, const Optional<U>&>::value || + std::is_assignable<T&, Optional<U>&&>::value || + std::is_assignable<T&, const Optional<U>&&>::value> {}; + +} // namespace internal + +template <typename T> +class Optional { + public: + using value_type = T; + + constexpr Optional() = default; + constexpr Optional(const Optional& other) noexcept = default; + constexpr Optional(Optional&& other) noexcept = default; + + constexpr Optional(nullopt_t); + + template <typename U, + typename std::enable_if< + std::is_constructible<T, const U&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + std::is_convertible<const U&, T>::value, + bool>::type = false> + Optional(const Optional<U>& other) noexcept; + + template <typename U, + typename std::enable_if< + std::is_constructible<T, const U&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + !std::is_convertible<const U&, T>::value, + bool>::type = false> + explicit Optional(const Optional<U>& other) noexcept; + + template <typename U, + typename std::enable_if< + std::is_constructible<T, U&&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + std::is_convertible<U&&, T>::value, + bool>::type = false> + Optional(Optional<U>&& other) noexcept; + + 
template <typename U, + typename std::enable_if< + std::is_constructible<T, U&&>::value && + !internal::IsConvertibleFromOptional<T, U>::value && + !std::is_convertible<U&&, T>::value, + bool>::type = false> + explicit Optional(Optional<U>&& other) noexcept; + + template <class... Args> + constexpr explicit Optional(in_place_t, Args&&... args); + + template <class U, class... Args, + class = typename std::enable_if<std::is_constructible< + value_type, std::initializer_list<U>&, Args...>::value>::type> + constexpr explicit Optional(in_place_t, std::initializer_list<U> il, + Args&&... args); + + template < + typename U = value_type, + typename std::enable_if< + std::is_constructible<T, U&&>::value && + !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + std::is_convertible<U&&, T>::value, + bool>::type = false> + constexpr Optional(U&& value); + + template < + typename U = value_type, + typename std::enable_if< + std::is_constructible<T, U&&>::value && + !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + !std::is_convertible<U&&, T>::value, + bool>::type = false> + constexpr explicit Optional(U&& value); + + Optional& operator=(const Optional& other) noexcept; + + Optional& operator=(Optional&& other) noexcept; + + Optional& operator=(nullopt_t); + + template <typename U> + typename std::enable_if< + !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && + std::is_constructible<T, U>::value && + std::is_assignable<T&, U>::value && + (!std::is_scalar<T>::value || + !std::is_same<typename std::decay<U>::type, T>::value), + Optional&>::type + operator=(U&& value) noexcept; + + template <typename U> + typename std::enable_if<!internal::IsAssignableFromOptional<T, U>::value && + std::is_constructible<T, const U&>::value && + std::is_assignable<T&, const U&>::value, + Optional&>::type + operator=(const 
Optional<U>& other) noexcept; + + template <typename U> + typename std::enable_if<!internal::IsAssignableFromOptional<T, U>::value && + std::is_constructible<T, U>::value && + std::is_assignable<T&, U>::value, + Optional&>::type + operator=(Optional<U>&& other) noexcept; + + const T& operator*() const&; + T& operator*() &; + const T&& operator*() const&&; + T&& operator*() &&; + + const T* operator->() const; + T* operator->(); + + const T& value() const&; + T& value() &; + const T&& value() const&&; + T&& value() &&; + + template <typename U> + constexpr T value_or(U&& v) const&; + template <typename U> + T value_or(U&& v) &&; + + template <typename... Args> + T& emplace(Args&&... args); + + template <typename U, typename... Args> + T& emplace(std::initializer_list<U> ilist, Args&&... args); + + void reset() noexcept; + + constexpr explicit operator bool() const noexcept; + constexpr bool has_value() const noexcept; + + void swap(Optional& other); +}; + +template <typename T> +constexpr Optional<typename std::decay<T>::type> make_optional(T&& v); + +template <typename T, typename... Args> +constexpr Optional<T> make_optional(Args&&... args); + +template <typename T, typename U, typename... Args> +constexpr Optional<T> make_optional(std::initializer_list<U> il, + Args&&... 
args); + +template <typename T, typename U> +constexpr bool operator==(const Optional<T> &lhs, const Optional<U> &rhs); +template <typename T, typename U> +constexpr bool operator!=(const Optional<T> &lhs, const Optional<U> &rhs); + +template <typename T> +constexpr bool operator==(const Optional<T> &opt, nullopt_t); +template <typename T> +constexpr bool operator==(nullopt_t, const Optional<T> &opt); +template <typename T> +constexpr bool operator!=(const Optional<T> &opt, nullopt_t); +template <typename T> +constexpr bool operator!=(nullopt_t, const Optional<T> &opt); + +template <typename T, typename U> +constexpr bool operator==(const Optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator==(const T &value, const Optional<U> &opt); +template <typename T, typename U> +constexpr bool operator!=(const Optional<T> &opt, const U &value); +template <typename T, typename U> +constexpr bool operator!=(const T &value, const Optional<U> &opt); + +} // namespace base +)"; + +std::vector<std::pair<std::string, std::string>> getMockHeaders() { + std::vector<std::pair<std::string, std::string>> Headers; + Headers.emplace_back("cstddef.h", CStdDefHeader); + Headers.emplace_back("std_initializer_list.h", StdInitializerListHeader); + Headers.emplace_back("std_string.h", StdStringHeader); + Headers.emplace_back("std_type_traits.h", StdTypeTraitsHeader); + Headers.emplace_back("std_utility.h", StdUtilityHeader); + Headers.emplace_back("std_optional.h", StdOptionalHeader); + Headers.emplace_back("absl_type_traits.h", AbslTypeTraitsHeader); + Headers.emplace_back("absl_optional.h", AbslOptionalHeader); + Headers.emplace_back("base_optional.h", BaseOptionalHeader); + return Headers; +} + +} // namespace test +} // namespace dataflow +} // namespace clang
\ No newline at end of file diff --git a/clang/unittests/Analysis/FlowSensitive/MockHeaders.h b/clang/unittests/Analysis/FlowSensitive/MockHeaders.h new file mode 100644 index 0000000..c0b544f --- /dev/null +++ b/clang/unittests/Analysis/FlowSensitive/MockHeaders.h @@ -0,0 +1,30 @@ +//===--- MockHeaders.h - Mock headers for dataflow analyses -*- C++ -----*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines mock headers for testing of dataflow analyses. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_MOCK_HEADERS_H_ +#define LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_MOCK_HEADERS_H_ + +#include <string> +#include <utility> +#include <vector> + +namespace clang { +namespace dataflow { +namespace test { + +std::vector<std::pair<std::string, std::string>> getMockHeaders(); + +} // namespace test +} // namespace dataflow +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_FLOW_SENSITIVE_MOCK_HEADERS_H_ diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp index 1dd0783..ba509e8 100644 --- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp @@ -8,6 +8,7 @@ // FIXME: Move this to clang/unittests/Analysis/FlowSensitive/Models. 
#include "clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h" +#include "MockHeaders.h" #include "TestingSupport.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchers.h" @@ -31,1232 +32,6 @@ using namespace test; using ::testing::ContainerEq; -// FIXME: Move header definitions in separate file(s). -static constexpr char CSDtdDefHeader[] = R"( -#ifndef CSTDDEF_H -#define CSTDDEF_H - -namespace std { - -typedef decltype(sizeof(char)) size_t; - -using nullptr_t = decltype(nullptr); - -} // namespace std - -#endif // CSTDDEF_H -)"; - -static constexpr char StdTypeTraitsHeader[] = R"( -#ifndef STD_TYPE_TRAITS_H -#define STD_TYPE_TRAITS_H - -#include "cstddef.h" - -namespace std { - -template <typename T, T V> -struct integral_constant { - static constexpr T value = V; -}; - -using true_type = integral_constant<bool, true>; -using false_type = integral_constant<bool, false>; - -template< class T > struct remove_reference {typedef T type;}; -template< class T > struct remove_reference<T&> {typedef T type;}; -template< class T > struct remove_reference<T&&> {typedef T type;}; - -template <class T> - using remove_reference_t = typename remove_reference<T>::type; - -template <class T> -struct remove_extent { - typedef T type; -}; - -template <class T> -struct remove_extent<T[]> { - typedef T type; -}; - -template <class T, size_t N> -struct remove_extent<T[N]> { - typedef T type; -}; - -template <class T> -struct is_array : false_type {}; - -template <class T> -struct is_array<T[]> : true_type {}; - -template <class T, size_t N> -struct is_array<T[N]> : true_type {}; - -template <class> -struct is_function : false_type {}; - -template <class Ret, class... 
Args> -struct is_function<Ret(Args...)> : true_type {}; - -namespace detail { - -template <class T> -struct type_identity { - using type = T; -}; // or use type_identity (since C++20) - -template <class T> -auto try_add_pointer(int) -> type_identity<typename remove_reference<T>::type*>; -template <class T> -auto try_add_pointer(...) -> type_identity<T>; - -} // namespace detail - -template <class T> -struct add_pointer : decltype(detail::try_add_pointer<T>(0)) {}; - -template <bool B, class T, class F> -struct conditional { - typedef T type; -}; - -template <class T, class F> -struct conditional<false, T, F> { - typedef F type; -}; - -template <class T> -struct remove_cv { - typedef T type; -}; -template <class T> -struct remove_cv<const T> { - typedef T type; -}; -template <class T> -struct remove_cv<volatile T> { - typedef T type; -}; -template <class T> -struct remove_cv<const volatile T> { - typedef T type; -}; - -template <class T> -using remove_cv_t = typename remove_cv<T>::type; - -template <class T> -struct decay { - private: - typedef typename remove_reference<T>::type U; - - public: - typedef typename conditional< - is_array<U>::value, typename remove_extent<U>::type*, - typename conditional<is_function<U>::value, typename add_pointer<U>::type, - typename remove_cv<U>::type>::type>::type type; -}; - -template <bool B, class T = void> -struct enable_if {}; - -template <class T> -struct enable_if<true, T> { - typedef T type; -}; - -template <bool B, class T = void> -using enable_if_t = typename enable_if<B, T>::type; - -template <class T, class U> -struct is_same : false_type {}; - -template <class T> -struct is_same<T, T> : true_type {}; - -template <class T> -struct is_void : is_same<void, typename remove_cv<T>::type> {}; - -namespace detail { - -template <class T> -auto try_add_lvalue_reference(int) -> type_identity<T&>; -template <class T> -auto try_add_lvalue_reference(...) 
-> type_identity<T>; - -template <class T> -auto try_add_rvalue_reference(int) -> type_identity<T&&>; -template <class T> -auto try_add_rvalue_reference(...) -> type_identity<T>; - -} // namespace detail - -template <class T> -struct add_lvalue_reference : decltype(detail::try_add_lvalue_reference<T>(0)) { -}; - -template <class T> -struct add_rvalue_reference : decltype(detail::try_add_rvalue_reference<T>(0)) { -}; - -template <class T> -typename add_rvalue_reference<T>::type declval() noexcept; - -namespace detail { - -template <class T> -auto test_returnable(int) - -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{}); -template <class> -auto test_returnable(...) -> false_type; - -template <class From, class To> -auto test_implicitly_convertible(int) - -> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{}); -template <class, class> -auto test_implicitly_convertible(...) -> false_type; - -} // namespace detail - -template <class From, class To> -struct is_convertible - : integral_constant<bool, - (decltype(detail::test_returnable<To>(0))::value && - decltype(detail::test_implicitly_convertible<From, To>( - 0))::value) || - (is_void<From>::value && is_void<To>::value)> {}; - -template <class From, class To> -inline constexpr bool is_convertible_v = is_convertible<From, To>::value; - -template <class...> -using void_t = void; - -template <class, class T, class... Args> -struct is_constructible_ : false_type {}; - -template <class T, class... Args> -struct is_constructible_<void_t<decltype(T(declval<Args>()...))>, T, Args...> - : true_type {}; - -template <class T, class... Args> -using is_constructible = is_constructible_<void_t<>, T, Args...>; - -template <class T, class... 
Args> -inline constexpr bool is_constructible_v = is_constructible<T, Args...>::value; - -template <class _Tp> -struct __uncvref { - typedef typename remove_cv<typename remove_reference<_Tp>::type>::type type; -}; - -template <class _Tp> -using __uncvref_t = typename __uncvref<_Tp>::type; - -template <bool _Val> -using _BoolConstant = integral_constant<bool, _Val>; - -template <class _Tp, class _Up> -using _IsSame = _BoolConstant<__is_same(_Tp, _Up)>; - -template <class _Tp, class _Up> -using _IsNotSame = _BoolConstant<!__is_same(_Tp, _Up)>; - -template <bool> -struct _MetaBase; -template <> -struct _MetaBase<true> { - template <class _Tp, class _Up> - using _SelectImpl = _Tp; - template <template <class...> class _FirstFn, template <class...> class, - class... _Args> - using _SelectApplyImpl = _FirstFn<_Args...>; - template <class _First, class...> - using _FirstImpl = _First; - template <class, class _Second, class...> - using _SecondImpl = _Second; - template <class _Result, class _First, class... _Rest> - using _OrImpl = - typename _MetaBase<_First::value != true && sizeof...(_Rest) != 0>:: - template _OrImpl<_First, _Rest...>; -}; - -template <> -struct _MetaBase<false> { - template <class _Tp, class _Up> - using _SelectImpl = _Up; - template <template <class...> class, template <class...> class _SecondFn, - class... _Args> - using _SelectApplyImpl = _SecondFn<_Args...>; - template <class _Result, class...> - using _OrImpl = _Result; -}; - -template <bool _Cond, class _IfRes, class _ElseRes> -using _If = typename _MetaBase<_Cond>::template _SelectImpl<_IfRes, _ElseRes>; - -template <class... _Rest> -using _Or = typename _MetaBase<sizeof...(_Rest) != - 0>::template _OrImpl<false_type, _Rest...>; - -template <bool _Bp, class _Tp = void> -using __enable_if_t = typename enable_if<_Bp, _Tp>::type; - -template <class...> -using __expand_to_true = true_type; -template <class... 
_Pred> -__expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int); -template <class...> -false_type __and_helper(...); -template <class... _Pred> -using _And = decltype(__and_helper<_Pred...>(0)); - -template <class _Pred> -struct _Not : _BoolConstant<!_Pred::value> {}; - -struct __check_tuple_constructor_fail { - static constexpr bool __enable_explicit_default() { return false; } - static constexpr bool __enable_implicit_default() { return false; } - template <class...> - static constexpr bool __enable_explicit() { - return false; - } - template <class...> - static constexpr bool __enable_implicit() { - return false; - } -}; - -template <typename, typename _Tp> -struct __select_2nd { - typedef _Tp type; -}; -template <class _Tp, class _Arg> -typename __select_2nd<decltype((declval<_Tp>() = declval<_Arg>())), - true_type>::type -__is_assignable_test(int); -template <class, class> -false_type __is_assignable_test(...); -template <class _Tp, class _Arg, - bool = is_void<_Tp>::value || is_void<_Arg>::value> -struct __is_assignable_imp - : public decltype((__is_assignable_test<_Tp, _Arg>(0))) {}; -template <class _Tp, class _Arg> -struct __is_assignable_imp<_Tp, _Arg, true> : public false_type {}; -template <class _Tp, class _Arg> -struct is_assignable : public __is_assignable_imp<_Tp, _Arg> {}; - -template <class _Tp> -struct __libcpp_is_integral : public false_type {}; -template <> -struct __libcpp_is_integral<bool> : public true_type {}; -template <> -struct __libcpp_is_integral<char> : public true_type {}; -template <> -struct __libcpp_is_integral<signed char> : public true_type {}; -template <> -struct __libcpp_is_integral<unsigned char> : public true_type {}; -template <> -struct __libcpp_is_integral<wchar_t> : public true_type {}; -template <> -struct __libcpp_is_integral<short> : public true_type {}; // NOLINT -template <> -struct __libcpp_is_integral<unsigned short> : public true_type {}; // NOLINT -template <> -struct __libcpp_is_integral<int> : 
public true_type {}; -template <> -struct __libcpp_is_integral<unsigned int> : public true_type {}; -template <> -struct __libcpp_is_integral<long> : public true_type {}; // NOLINT -template <> -struct __libcpp_is_integral<unsigned long> : public true_type {}; // NOLINT -template <> -struct __libcpp_is_integral<long long> : public true_type {}; // NOLINT -template <> // NOLINTNEXTLINE -struct __libcpp_is_integral<unsigned long long> : public true_type {}; -template <class _Tp> -struct is_integral - : public __libcpp_is_integral<typename remove_cv<_Tp>::type> {}; - -template <class _Tp> -struct __libcpp_is_floating_point : public false_type {}; -template <> -struct __libcpp_is_floating_point<float> : public true_type {}; -template <> -struct __libcpp_is_floating_point<double> : public true_type {}; -template <> -struct __libcpp_is_floating_point<long double> : public true_type {}; -template <class _Tp> -struct is_floating_point - : public __libcpp_is_floating_point<typename remove_cv<_Tp>::type> {}; - -template <class _Tp> -struct is_arithmetic - : public integral_constant<bool, is_integral<_Tp>::value || - is_floating_point<_Tp>::value> {}; - -template <class _Tp> -struct __libcpp_is_pointer : public false_type {}; -template <class _Tp> -struct __libcpp_is_pointer<_Tp*> : public true_type {}; -template <class _Tp> -struct is_pointer : public __libcpp_is_pointer<typename remove_cv<_Tp>::type> { -}; - -template <class _Tp> -struct __libcpp_is_member_pointer : public false_type {}; -template <class _Tp, class _Up> -struct __libcpp_is_member_pointer<_Tp _Up::*> : public true_type {}; -template <class _Tp> -struct is_member_pointer - : public __libcpp_is_member_pointer<typename remove_cv<_Tp>::type> {}; - -template <class _Tp> -struct __libcpp_union : public false_type {}; -template <class _Tp> -struct is_union : public __libcpp_union<typename remove_cv<_Tp>::type> {}; - -template <class T> -struct is_reference : false_type {}; -template <class T> -struct 
is_reference<T&> : true_type {}; -template <class T> -struct is_reference<T&&> : true_type {}; - -template <class T> -inline constexpr bool is_reference_v = is_reference<T>::value; - -struct __two { - char __lx[2]; -}; - -namespace __is_class_imp { -template <class _Tp> -char __test(int _Tp::*); -template <class _Tp> -__two __test(...); -} // namespace __is_class_imp -template <class _Tp> -struct is_class - : public integral_constant<bool, - sizeof(__is_class_imp::__test<_Tp>(0)) == 1 && - !is_union<_Tp>::value> {}; - -template <class _Tp> -struct __is_nullptr_t_impl : public false_type {}; -template <> -struct __is_nullptr_t_impl<nullptr_t> : public true_type {}; -template <class _Tp> -struct __is_nullptr_t - : public __is_nullptr_t_impl<typename remove_cv<_Tp>::type> {}; -template <class _Tp> -struct is_null_pointer - : public __is_nullptr_t_impl<typename remove_cv<_Tp>::type> {}; - -template <class _Tp> -struct is_enum - : public integral_constant< - bool, !is_void<_Tp>::value && !is_integral<_Tp>::value && - !is_floating_point<_Tp>::value && !is_array<_Tp>::value && - !is_pointer<_Tp>::value && !is_reference<_Tp>::value && - !is_member_pointer<_Tp>::value && !is_union<_Tp>::value && - !is_class<_Tp>::value && !is_function<_Tp>::value> {}; - -template <class _Tp> -struct is_scalar - : public integral_constant< - bool, is_arithmetic<_Tp>::value || is_member_pointer<_Tp>::value || - is_pointer<_Tp>::value || __is_nullptr_t<_Tp>::value || - is_enum<_Tp>::value> {}; -template <> -struct is_scalar<nullptr_t> : public true_type {}; - -} // namespace std - -#endif // STD_TYPE_TRAITS_H -)"; - -static constexpr char AbslTypeTraitsHeader[] = R"( -#ifndef ABSL_TYPE_TRAITS_H -#define ABSL_TYPE_TRAITS_H - -#include "std_type_traits.h" - -namespace absl { - -template <typename... Ts> -struct conjunction : std::true_type {}; - -template <typename T, typename... 
Ts> -struct conjunction<T, Ts...> - : std::conditional<T::value, conjunction<Ts...>, T>::type {}; - -template <typename T> -struct conjunction<T> : T {}; - -template <typename T> -struct negation : std::integral_constant<bool, !T::value> {}; - -template <bool B, typename T = void> -using enable_if_t = typename std::enable_if<B, T>::type; - -} // namespace absl - -#endif // ABSL_TYPE_TRAITS_H -)"; - -static constexpr char StdStringHeader[] = R"( -#ifndef STRING_H -#define STRING_H - -namespace std { - -struct string { - string(const char*); - ~string(); - bool empty(); -}; -bool operator!=(const string &LHS, const char *RHS); - -} // namespace std - -#endif // STRING_H -)"; - -static constexpr char StdUtilityHeader[] = R"( -#ifndef UTILITY_H -#define UTILITY_H - -#include "std_type_traits.h" - -namespace std { - -template <typename T> -constexpr remove_reference_t<T>&& move(T&& x); - -template <typename T> -void swap(T& a, T& b) noexcept; - -} // namespace std - -#endif // UTILITY_H -)"; - -static constexpr char StdInitializerListHeader[] = R"( -#ifndef INITIALIZER_LIST_H -#define INITIALIZER_LIST_H - -namespace std { - -template <typename T> -class initializer_list { - public: - const T *a, *b; - initializer_list() noexcept; -}; - -} // namespace std - -#endif // INITIALIZER_LIST_H -)"; - -static constexpr char StdOptionalHeader[] = R"( -#include "std_initializer_list.h" -#include "std_type_traits.h" -#include "std_utility.h" - -namespace std { - -struct in_place_t {}; -constexpr in_place_t in_place; - -struct nullopt_t { - constexpr explicit nullopt_t() {} -}; -constexpr nullopt_t nullopt; - -template <class _Tp> -struct __optional_destruct_base { - constexpr void reset() noexcept; -}; - -template <class _Tp> -struct __optional_storage_base : __optional_destruct_base<_Tp> { - constexpr bool has_value() const noexcept; -}; - -template <typename _Tp> -class optional : private __optional_storage_base<_Tp> { - using __base = __optional_storage_base<_Tp>; - - public: - 
using value_type = _Tp; - - private: - struct _CheckOptionalArgsConstructor { - template <class _Up> - static constexpr bool __enable_implicit() { - return is_constructible_v<_Tp, _Up&&> && is_convertible_v<_Up&&, _Tp>; - } - - template <class _Up> - static constexpr bool __enable_explicit() { - return is_constructible_v<_Tp, _Up&&> && !is_convertible_v<_Up&&, _Tp>; - } - }; - template <class _Up> - using _CheckOptionalArgsCtor = - _If<_IsNotSame<__uncvref_t<_Up>, in_place_t>::value && - _IsNotSame<__uncvref_t<_Up>, optional>::value, - _CheckOptionalArgsConstructor, __check_tuple_constructor_fail>; - template <class _QualUp> - struct _CheckOptionalLikeConstructor { - template <class _Up, class _Opt = optional<_Up>> - using __check_constructible_from_opt = - _Or<is_constructible<_Tp, _Opt&>, is_constructible<_Tp, _Opt const&>, - is_constructible<_Tp, _Opt&&>, is_constructible<_Tp, _Opt const&&>, - is_convertible<_Opt&, _Tp>, is_convertible<_Opt const&, _Tp>, - is_convertible<_Opt&&, _Tp>, is_convertible<_Opt const&&, _Tp>>; - template <class _Up, class _QUp = _QualUp> - static constexpr bool __enable_implicit() { - return is_convertible<_QUp, _Tp>::value && - !__check_constructible_from_opt<_Up>::value; - } - template <class _Up, class _QUp = _QualUp> - static constexpr bool __enable_explicit() { - return !is_convertible<_QUp, _Tp>::value && - !__check_constructible_from_opt<_Up>::value; - } - }; - - template <class _Up, class _QualUp> - using _CheckOptionalLikeCtor = - _If<_And<_IsNotSame<_Up, _Tp>, is_constructible<_Tp, _QualUp>>::value, - _CheckOptionalLikeConstructor<_QualUp>, - __check_tuple_constructor_fail>; - - - template <class _Up, class _QualUp> - using _CheckOptionalLikeAssign = _If< - _And< - _IsNotSame<_Up, _Tp>, - is_constructible<_Tp, _QualUp>, - is_assignable<_Tp&, _QualUp> - >::value, - _CheckOptionalLikeConstructor<_QualUp>, - __check_tuple_constructor_fail - >; - - public: - constexpr optional() noexcept {} - constexpr optional(const optional&) = 
default; - constexpr optional(optional&&) = default; - constexpr optional(nullopt_t) noexcept {} - - template < - class _InPlaceT, class... _Args, - class = enable_if_t<_And<_IsSame<_InPlaceT, in_place_t>, - is_constructible<value_type, _Args...>>::value>> - constexpr explicit optional(_InPlaceT, _Args&&... __args); - - template <class _Up, class... _Args, - class = enable_if_t<is_constructible_v< - value_type, initializer_list<_Up>&, _Args...>>> - constexpr explicit optional(in_place_t, initializer_list<_Up> __il, - _Args&&... __args); - - template < - class _Up = value_type, - enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>(), - int> = 0> - constexpr optional(_Up&& __v); - - template < - class _Up, - enable_if_t<_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>(), - int> = 0> - constexpr explicit optional(_Up&& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>:: - template __enable_implicit<_Up>(), - int> = 0> - constexpr optional(const optional<_Up>& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up const&>:: - template __enable_explicit<_Up>(), - int> = 0> - constexpr explicit optional(const optional<_Up>& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: - template __enable_implicit<_Up>(), - int> = 0> - constexpr optional(optional<_Up>&& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: - template __enable_explicit<_Up>(), - int> = 0> - constexpr explicit optional(optional<_Up>&& __v); - - constexpr optional& operator=(nullopt_t) noexcept; - - optional& operator=(const optional&); - - optional& operator=(optional&&); - - template <class _Up = value_type, - class = enable_if_t<_And<_IsNotSame<__uncvref_t<_Up>, optional>, - _Or<_IsNotSame<__uncvref_t<_Up>, value_type>, - _Not<is_scalar<value_type>>>, - is_constructible<value_type, _Up>, - is_assignable<value_type&, _Up>>::value>> - constexpr optional& 
operator=(_Up&& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeAssign<_Up, _Up const&>:: - template __enable_assign<_Up>(), - int> = 0> - constexpr optional& operator=(const optional<_Up>& __v); - - template <class _Up, enable_if_t<_CheckOptionalLikeCtor<_Up, _Up&&>:: - template __enable_assign<_Up>(), - int> = 0> - constexpr optional& operator=(optional<_Up>&& __v); - - const _Tp& operator*() const&; - _Tp& operator*() &; - const _Tp&& operator*() const&&; - _Tp&& operator*() &&; - - const _Tp* operator->() const; - _Tp* operator->(); - - const _Tp& value() const&; - _Tp& value() &; - const _Tp&& value() const&&; - _Tp&& value() &&; - - template <typename U> - constexpr _Tp value_or(U&& v) const&; - template <typename U> - _Tp value_or(U&& v) &&; - - template <typename... Args> - _Tp& emplace(Args&&... args); - - template <typename U, typename... Args> - _Tp& emplace(std::initializer_list<U> ilist, Args&&... args); - - using __base::reset; - - constexpr explicit operator bool() const noexcept; - using __base::has_value; - - constexpr void swap(optional& __opt) noexcept; -}; - -template <typename T> -constexpr optional<typename std::decay<T>::type> make_optional(T&& v); - -template <typename T, typename... Args> -constexpr optional<T> make_optional(Args&&... args); - -template <typename T, typename U, typename... Args> -constexpr optional<T> make_optional(std::initializer_list<U> il, - Args&&... args); - -template <typename T, typename U> -constexpr bool operator==(const optional<T> &lhs, const optional<U> &rhs); -template <typename T, typename U> -constexpr bool operator!=(const optional<T> &lhs, const optional<U> &rhs); - -template <typename T> -constexpr bool operator==(const optional<T> &opt, nullopt_t); - -// C++20 and later do not define the following overloads because they are -// provided by rewritten candidates instead. 
-#if __cplusplus < 202002L -template <typename T> -constexpr bool operator==(nullopt_t, const optional<T> &opt); -template <typename T> -constexpr bool operator!=(const optional<T> &opt, nullopt_t); -template <typename T> -constexpr bool operator!=(nullopt_t, const optional<T> &opt); -#endif // __cplusplus < 202002L - -template <typename T, typename U> -constexpr bool operator==(const optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator==(const T &value, const optional<U> &opt); -template <typename T, typename U> -constexpr bool operator!=(const optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator!=(const T &value, const optional<U> &opt); - -} // namespace std -)"; - -static constexpr char AbslOptionalHeader[] = R"( -#include "absl_type_traits.h" -#include "std_initializer_list.h" -#include "std_type_traits.h" -#include "std_utility.h" - -namespace absl { - -struct nullopt_t { - constexpr explicit nullopt_t() {} -}; -constexpr nullopt_t nullopt; - -struct in_place_t {}; -constexpr in_place_t in_place; - -template <typename T> -class optional; - -namespace optional_internal { - -template <typename T, typename U> -struct is_constructible_convertible_from_optional - : std::integral_constant< - bool, std::is_constructible<T, optional<U>&>::value || - std::is_constructible<T, optional<U>&&>::value || - std::is_constructible<T, const optional<U>&>::value || - std::is_constructible<T, const optional<U>&&>::value || - std::is_convertible<optional<U>&, T>::value || - std::is_convertible<optional<U>&&, T>::value || - std::is_convertible<const optional<U>&, T>::value || - std::is_convertible<const optional<U>&&, T>::value> {}; - -template <typename T, typename U> -struct is_constructible_convertible_assignable_from_optional - : std::integral_constant< - bool, is_constructible_convertible_from_optional<T, U>::value || - std::is_assignable<T&, optional<U>&>::value || - std::is_assignable<T&, 
optional<U>&&>::value || - std::is_assignable<T&, const optional<U>&>::value || - std::is_assignable<T&, const optional<U>&&>::value> {}; - -} // namespace optional_internal - -template <typename T> -class optional { - public: - constexpr optional() noexcept; - - constexpr optional(nullopt_t) noexcept; - - optional(const optional&) = default; - - optional(optional&&) = default; - - template <typename InPlaceT, typename... Args, - absl::enable_if_t<absl::conjunction< - std::is_same<InPlaceT, in_place_t>, - std::is_constructible<T, Args&&...>>::value>* = nullptr> - constexpr explicit optional(InPlaceT, Args&&... args); - - template <typename U, typename... Args, - typename = typename std::enable_if<std::is_constructible< - T, std::initializer_list<U>&, Args&&...>::value>::type> - constexpr explicit optional(in_place_t, std::initializer_list<U> il, - Args&&... args); - - template < - typename U = T, - typename std::enable_if< - absl::conjunction<absl::negation<std::is_same< - in_place_t, typename std::decay<U>::type>>, - absl::negation<std::is_same< - optional<T>, typename std::decay<U>::type>>, - std::is_convertible<U&&, T>, - std::is_constructible<T, U&&>>::value, - bool>::type = false> - constexpr optional(U&& v); - - template < - typename U = T, - typename std::enable_if< - absl::conjunction<absl::negation<std::is_same< - in_place_t, typename std::decay<U>::type>>, - absl::negation<std::is_same< - optional<T>, typename std::decay<U>::type>>, - absl::negation<std::is_convertible<U&&, T>>, - std::is_constructible<T, U&&>>::value, - bool>::type = false> - explicit constexpr optional(U&& v); - - template <typename U, - typename std::enable_if< - absl::conjunction< - absl::negation<std::is_same<T, U>>, - std::is_constructible<T, const U&>, - absl::negation< - optional_internal:: - is_constructible_convertible_from_optional<T, U>>, - std::is_convertible<const U&, T>>::value, - bool>::type = false> - optional(const optional<U>& rhs); - - template <typename U, - typename 
std::enable_if< - absl::conjunction< - absl::negation<std::is_same<T, U>>, - std::is_constructible<T, const U&>, - absl::negation< - optional_internal:: - is_constructible_convertible_from_optional<T, U>>, - absl::negation<std::is_convertible<const U&, T>>>::value, - bool>::type = false> - explicit optional(const optional<U>& rhs); - - template < - typename U, - typename std::enable_if< - absl::conjunction< - absl::negation<std::is_same<T, U>>, std::is_constructible<T, U&&>, - absl::negation< - optional_internal::is_constructible_convertible_from_optional< - T, U>>, - std::is_convertible<U&&, T>>::value, - bool>::type = false> - optional(optional<U>&& rhs); - - template < - typename U, - typename std::enable_if< - absl::conjunction< - absl::negation<std::is_same<T, U>>, std::is_constructible<T, U&&>, - absl::negation< - optional_internal::is_constructible_convertible_from_optional< - T, U>>, - absl::negation<std::is_convertible<U&&, T>>>::value, - bool>::type = false> - explicit optional(optional<U>&& rhs); - - optional& operator=(nullopt_t) noexcept; - - optional& operator=(const optional& src); - - optional& operator=(optional&& src); - - template < - typename U = T, - typename = typename std::enable_if<absl::conjunction< - absl::negation< - std::is_same<optional<T>, typename std::decay<U>::type>>, - absl::negation< - absl::conjunction<std::is_scalar<T>, - std::is_same<T, typename std::decay<U>::type>>>, - std::is_constructible<T, U>, std::is_assignable<T&, U>>::value>::type> - optional& operator=(U&& v); - - template < - typename U, - typename = typename std::enable_if<absl::conjunction< - absl::negation<std::is_same<T, U>>, - std::is_constructible<T, const U&>, std::is_assignable<T&, const U&>, - absl::negation< - optional_internal:: - is_constructible_convertible_assignable_from_optional< - T, U>>>::value>::type> - optional& operator=(const optional<U>& rhs); - - template <typename U, - typename = typename std::enable_if<absl::conjunction< - 
absl::negation<std::is_same<T, U>>, std::is_constructible<T, U>, - std::is_assignable<T&, U>, - absl::negation< - optional_internal:: - is_constructible_convertible_assignable_from_optional< - T, U>>>::value>::type> - optional& operator=(optional<U>&& rhs); - - const T& operator*() const&; - T& operator*() &; - const T&& operator*() const&&; - T&& operator*() &&; - - const T* operator->() const; - T* operator->(); - - const T& value() const&; - T& value() &; - const T&& value() const&&; - T&& value() &&; - - template <typename U> - constexpr T value_or(U&& v) const&; - template <typename U> - T value_or(U&& v) &&; - - template <typename... Args> - T& emplace(Args&&... args); - - template <typename U, typename... Args> - T& emplace(std::initializer_list<U> ilist, Args&&... args); - - void reset() noexcept; - - constexpr explicit operator bool() const noexcept; - constexpr bool has_value() const noexcept; - - void swap(optional& rhs) noexcept; -}; - -template <typename T> -constexpr optional<typename std::decay<T>::type> make_optional(T&& v); - -template <typename T, typename... Args> -constexpr optional<T> make_optional(Args&&... args); - -template <typename T, typename U, typename... Args> -constexpr optional<T> make_optional(std::initializer_list<U> il, - Args&&... 
args); - -template <typename T, typename U> -constexpr bool operator==(const optional<T> &lhs, const optional<U> &rhs); -template <typename T, typename U> -constexpr bool operator!=(const optional<T> &lhs, const optional<U> &rhs); - -template <typename T> -constexpr bool operator==(const optional<T> &opt, nullopt_t); -template <typename T> -constexpr bool operator==(nullopt_t, const optional<T> &opt); -template <typename T> -constexpr bool operator!=(const optional<T> &opt, nullopt_t); -template <typename T> -constexpr bool operator!=(nullopt_t, const optional<T> &opt); - -template <typename T, typename U> -constexpr bool operator==(const optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator==(const T &value, const optional<U> &opt); -template <typename T, typename U> -constexpr bool operator!=(const optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator!=(const T &value, const optional<U> &opt); - -} // namespace absl -)"; - -static constexpr char BaseOptionalHeader[] = R"( -#include "std_initializer_list.h" -#include "std_type_traits.h" -#include "std_utility.h" - -namespace base { - -struct in_place_t {}; -constexpr in_place_t in_place; - -struct nullopt_t { - constexpr explicit nullopt_t() {} -}; -constexpr nullopt_t nullopt; - -template <typename T> -class Optional; - -namespace internal { - -template <typename T> -using RemoveCvRefT = std::remove_cv_t<std::remove_reference_t<T>>; - -template <typename T, typename U> -struct IsConvertibleFromOptional - : std::integral_constant< - bool, std::is_constructible<T, Optional<U>&>::value || - std::is_constructible<T, const Optional<U>&>::value || - std::is_constructible<T, Optional<U>&&>::value || - std::is_constructible<T, const Optional<U>&&>::value || - std::is_convertible<Optional<U>&, T>::value || - std::is_convertible<const Optional<U>&, T>::value || - std::is_convertible<Optional<U>&&, T>::value || - std::is_convertible<const 
Optional<U>&&, T>::value> {}; - -template <typename T, typename U> -struct IsAssignableFromOptional - : std::integral_constant< - bool, IsConvertibleFromOptional<T, U>::value || - std::is_assignable<T&, Optional<U>&>::value || - std::is_assignable<T&, const Optional<U>&>::value || - std::is_assignable<T&, Optional<U>&&>::value || - std::is_assignable<T&, const Optional<U>&&>::value> {}; - -} // namespace internal - -template <typename T> -class Optional { - public: - using value_type = T; - - constexpr Optional() = default; - constexpr Optional(const Optional& other) noexcept = default; - constexpr Optional(Optional&& other) noexcept = default; - - constexpr Optional(nullopt_t); - - template <typename U, - typename std::enable_if< - std::is_constructible<T, const U&>::value && - !internal::IsConvertibleFromOptional<T, U>::value && - std::is_convertible<const U&, T>::value, - bool>::type = false> - Optional(const Optional<U>& other) noexcept; - - template <typename U, - typename std::enable_if< - std::is_constructible<T, const U&>::value && - !internal::IsConvertibleFromOptional<T, U>::value && - !std::is_convertible<const U&, T>::value, - bool>::type = false> - explicit Optional(const Optional<U>& other) noexcept; - - template <typename U, - typename std::enable_if< - std::is_constructible<T, U&&>::value && - !internal::IsConvertibleFromOptional<T, U>::value && - std::is_convertible<U&&, T>::value, - bool>::type = false> - Optional(Optional<U>&& other) noexcept; - - template <typename U, - typename std::enable_if< - std::is_constructible<T, U&&>::value && - !internal::IsConvertibleFromOptional<T, U>::value && - !std::is_convertible<U&&, T>::value, - bool>::type = false> - explicit Optional(Optional<U>&& other) noexcept; - - template <class... Args> - constexpr explicit Optional(in_place_t, Args&&... args); - - template <class U, class... 
Args, - class = typename std::enable_if<std::is_constructible< - value_type, std::initializer_list<U>&, Args...>::value>::type> - constexpr explicit Optional(in_place_t, std::initializer_list<U> il, - Args&&... args); - - template < - typename U = value_type, - typename std::enable_if< - std::is_constructible<T, U&&>::value && - !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && - !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && - std::is_convertible<U&&, T>::value, - bool>::type = false> - constexpr Optional(U&& value); - - template < - typename U = value_type, - typename std::enable_if< - std::is_constructible<T, U&&>::value && - !std::is_same<internal::RemoveCvRefT<U>, in_place_t>::value && - !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && - !std::is_convertible<U&&, T>::value, - bool>::type = false> - constexpr explicit Optional(U&& value); - - Optional& operator=(const Optional& other) noexcept; - - Optional& operator=(Optional&& other) noexcept; - - Optional& operator=(nullopt_t); - - template <typename U> - typename std::enable_if< - !std::is_same<internal::RemoveCvRefT<U>, Optional<T>>::value && - std::is_constructible<T, U>::value && - std::is_assignable<T&, U>::value && - (!std::is_scalar<T>::value || - !std::is_same<typename std::decay<U>::type, T>::value), - Optional&>::type - operator=(U&& value) noexcept; - - template <typename U> - typename std::enable_if<!internal::IsAssignableFromOptional<T, U>::value && - std::is_constructible<T, const U&>::value && - std::is_assignable<T&, const U&>::value, - Optional&>::type - operator=(const Optional<U>& other) noexcept; - - template <typename U> - typename std::enable_if<!internal::IsAssignableFromOptional<T, U>::value && - std::is_constructible<T, U>::value && - std::is_assignable<T&, U>::value, - Optional&>::type - operator=(Optional<U>&& other) noexcept; - - const T& operator*() const&; - T& operator*() &; - const T&& operator*() const&&; - T&& operator*() &&; - - 
const T* operator->() const; - T* operator->(); - - const T& value() const&; - T& value() &; - const T&& value() const&&; - T&& value() &&; - - template <typename U> - constexpr T value_or(U&& v) const&; - template <typename U> - T value_or(U&& v) &&; - - template <typename... Args> - T& emplace(Args&&... args); - - template <typename U, typename... Args> - T& emplace(std::initializer_list<U> ilist, Args&&... args); - - void reset() noexcept; - - constexpr explicit operator bool() const noexcept; - constexpr bool has_value() const noexcept; - - void swap(Optional& other); -}; - -template <typename T> -constexpr Optional<typename std::decay<T>::type> make_optional(T&& v); - -template <typename T, typename... Args> -constexpr Optional<T> make_optional(Args&&... args); - -template <typename T, typename U, typename... Args> -constexpr Optional<T> make_optional(std::initializer_list<U> il, - Args&&... args); - -template <typename T, typename U> -constexpr bool operator==(const Optional<T> &lhs, const Optional<U> &rhs); -template <typename T, typename U> -constexpr bool operator!=(const Optional<T> &lhs, const Optional<U> &rhs); - -template <typename T> -constexpr bool operator==(const Optional<T> &opt, nullopt_t); -template <typename T> -constexpr bool operator==(nullopt_t, const Optional<T> &opt); -template <typename T> -constexpr bool operator!=(const Optional<T> &opt, nullopt_t); -template <typename T> -constexpr bool operator!=(nullopt_t, const Optional<T> &opt); - -template <typename T, typename U> -constexpr bool operator==(const Optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator==(const T &value, const Optional<U> &opt); -template <typename T, typename U> -constexpr bool operator!=(const Optional<T> &opt, const U &value); -template <typename T, typename U> -constexpr bool operator!=(const T &value, const Optional<U> &opt); - -} // namespace base -)"; - /// Replaces all occurrences of `Pattern` in `S` with `Replacement`. 
static void ReplaceAllOccurrences(std::string &S, const std::string &Pattern, const std::string &Replacement) { @@ -1323,16 +98,7 @@ protected: ReplaceAllOccurrences(SourceCode, "$ns", GetParam().NamespaceName); ReplaceAllOccurrences(SourceCode, "$optional", GetParam().TypeName); - std::vector<std::pair<std::string, std::string>> Headers; - Headers.emplace_back("cstddef.h", CSDtdDefHeader); - Headers.emplace_back("std_initializer_list.h", StdInitializerListHeader); - Headers.emplace_back("std_string.h", StdStringHeader); - Headers.emplace_back("std_type_traits.h", StdTypeTraitsHeader); - Headers.emplace_back("std_utility.h", StdUtilityHeader); - Headers.emplace_back("std_optional.h", StdOptionalHeader); - Headers.emplace_back("absl_type_traits.h", AbslTypeTraitsHeader); - Headers.emplace_back("absl_optional.h", AbslOptionalHeader); - Headers.emplace_back("base_optional.h", BaseOptionalHeader); + auto Headers = getMockHeaders(); Headers.emplace_back("unchecked_optional_access_test.h", R"( #include "absl_optional.h" #include "base_optional.h" diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index c21b118..1152466 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1490,6 +1490,11 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresExpressions) { EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_RequiresExpressionLParen); EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_RequiresExpressionLBrace); + Tokens = annotate("bool foo{requires { 0; }};"); + ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::kw_requires, TT_RequiresExpression); + EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_RequiresExpressionLBrace); + Tokens = annotate("if (requires(int i) { i + 5; }) return;"); ASSERT_EQ(Tokens.size(), 17u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::kw_requires, TT_RequiresExpression); diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h 
b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 0e3c9aa2..2adfd6f2 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -458,6 +458,7 @@ struct IntrinsicLibrary { mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 7c5c5fb..5fe2a76 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1020,6 +1020,13 @@ static constexpr IntrinsicHandler handlers[]{ &I::genTMABulkCommitGroup, {{}}, /*isElemental=*/false}, + {"tma_bulk_g2s", + &I::genTMABulkG2S, + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, {"tma_bulk_wait_group", &I::genTMABulkWaitGroup, {{}}, @@ -3200,17 +3207,17 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertBarrierToLLVM(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier) { +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { mlir::Value llvmPtr = fir::ConvertOp::create( builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), barrier); mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( builder, loc, - mlir::LLVM::LLVMPointerType::get( - builder.getContext(), - static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Shared)), + 
mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), llvmPtr); return addrCast; } @@ -3220,7 +3227,8 @@ mlir::Value IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 1); - mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, barrier) .getResult(); @@ -3231,7 +3239,8 @@ mlir::Value IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - mlir::Value barrier = convertBarrierToLLVM(builder, loc, args[0]); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and // currently just the sink symbol `_`. 
@@ -3244,8 +3253,8 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, // BARRIER_INIT (CUDA) void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); - mlir::Value barrier = - convertBarrierToLLVM(builder, loc, fir::getBase(args[0])); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); mlir::NVVM::MBarrierInitSharedOp::create(builder, loc, barrier, fir::getBase(args[1]), {}); auto kind = mlir::NVVM::ProxyKindAttr::get( @@ -9204,6 +9213,20 @@ void IntrinsicLibrary::genTMABulkCommitGroup( mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); } +// TMA_BULK_G2S (CUDA) +void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + // TMA_BULK_WAIT_GROUP (CUDA) void IntrinsicLibrary::genTMABulkWaitGroup( llvm::ArrayRef<fir::ExtendedValue> args) { diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index db20685..a8b9aa8 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -2023,6 +2023,17 @@ implicit none end subroutine end interface + ! 
Generic load, count is in bytes + interface + attributes(device) subroutine tma_bulk_g2s(barrier, src, dst, nbytes) + !dir$ ignore_tkr src, dst + integer(8), shared :: barrier + integer(4), device :: src(*) + integer(4), shared :: dst(*) + integer(4), value :: nbytes + end subroutine + end interface + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 697b17b..83ee011 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -437,3 +437,14 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma() ! CHECK: nvvm.cp.async.bulk.commit.group ! CHECK: nvvm.cp.async.bulk.wait_group 0 + +attributes(global) subroutine test_bulk_g2s(c, a, b, n) + real(8), device :: a(*) + real(8), shared :: tmpa(1024) + integer(8), shared :: barrier1 + integer(4) :: tx_count + call tma_bulk_g2s(barrier1, a(j), tmpa, tx_count) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_bulk_g2s +! 
CHECK: nvvm.cp.async.bulk.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : <7>, <1> diff --git a/libunwind/test/aix_signal_unwind.pass.sh.S b/libunwind/test/aix_signal_unwind.pass.sh.S index bd0b8ac..0565757 100644 --- a/libunwind/test/aix_signal_unwind.pass.sh.S +++ b/libunwind/test/aix_signal_unwind.pass.sh.S @@ -169,7 +169,7 @@ L..abc0: .vbyte 4, 0x00000000 # Traceback table begin .byte 0x00 # Version = 0 .byte 0x09 # Language = CPlusPlus - .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue + .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue # +HasTraceBackTableOffset, -IsInternalProcedure # -HasControlledStorage, -IsTOCless # -IsFloatingPointPresent @@ -219,7 +219,7 @@ L..abc0: .vbyte 4, 0x00000000 # Traceback table begin .byte 0x00 # Version = 0 .byte 0x09 # Language = CPlusPlus - .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue + .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue # +HasTraceBackTableOffset, -IsInternalProcedure # -HasControlledStorage, -IsTOCless # -IsFloatingPointPresent diff --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll index 7a74d03..b3caf0f 100644 --- a/lld/test/ELF/lto/amdgcn-oses.ll +++ b/lld/test/ELF/lto/amdgcn-oses.ll @@ -25,7 +25,7 @@ ;--- amdhsa.ll target triple = "amdgcn-amd-amdhsa" -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} @@ -36,7 +36,7 @@ define void @_start() { ;--- amdpal.ll target triple = "amdgcn-amd-amdpal" -target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" define amdgpu_cs void @_start() { ret void @@ -44,7 +44,7 @@ define amdgpu_cs void @_start() { ;--- mesa3d.ll target triple = "amdgcn-amd-mesa3d" -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" define void @_start() { ret void diff --git a/lld/test/ELF/lto/amdgcn.ll b/lld/test/ELF/lto/amdgcn.ll index 4281e20..186185c 100644 --- a/lld/test/ELF/lto/amdgcn.ll +++ b/lld/test/ELF/lto/amdgcn.ll @@ -5,7 +5,7 @@ ; Make sure the amdgcn triple is handled target triple = "amdgcn-amd-amdhsa" -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" define void @_start() { ret void diff --git a/lld/test/ELF/lto/r600.ll b/lld/test/ELF/lto/r600.ll index 1c95edc..937c1bd 100644 --- a/lld/test/ELF/lto/r600.ll +++ b/lld/test/ELF/lto/r600.ll @@ -5,7 +5,7 @@ ; Make sure the r600 triple is handled target triple = "r600-mesa-mesa3d" -target datalayout = 
"e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target datalayout = "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" define void @_start() { ret void diff --git a/lldb/test/Shell/Expr/TestExprLanguageNote.test b/lldb/test/Shell/Expr/TestExprLanguageNote.test index b4387bf..7d8c702 100644 --- a/lldb/test/Shell/Expr/TestExprLanguageNote.test +++ b/lldb/test/Shell/Expr/TestExprLanguageNote.test @@ -1,3 +1,5 @@ +# REQUIRES: (system-windows && lld) || !system-windows + # RUN: split-file %s %t # RUN: %clang_host -g %t/main.cpp -o %t.out # @@ -11,7 +13,7 @@ int main() { int x = 10; - __builtin_debugtrap(); + return x; } #--- with-target.input @@ -21,6 +23,7 @@ expr blah # CHECK-TARGET: (lldb) expr # CHECK-TARGET: note: Falling back to default language. Ran expression as 'Objective C++'. +b 4 run expr blah diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index b455112..b939335 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -461,7 +461,7 @@ bool fromJSON(const json::Value &Params, DataBreakpointInfoArguments &DBIA, json::Path P) { json::ObjectMapper O(Params, P); return O && O.map("variablesReference", DBIA.variablesReference) && - O.map("name", DBIA.name) && O.map("frameId", DBIA.frameId) && + O.map("name", DBIA.name) && O.mapOptional("frameId", DBIA.frameId) && O.map("bytes", DBIA.bytes) && O.map("asAddress", DBIA.asAddress) && O.map("mode", DBIA.mode); } diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index 0989a5b..a5ae856 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -1101,3 +1101,28 @@ TEST(ProtocolTypesTest, MemoryEventBody) { })"; EXPECT_EQ(json, pp(body)); } + 
+TEST(ProtocolTypesTest, DataBreakpointInfoArguments) { + llvm::Expected<DataBreakpointInfoArguments> expected = + parse<DataBreakpointInfoArguments>(R"({ + "name": "data", + "variablesReference": 8, + "frameId": 9, + "bytes": 10, + "asAddress": false, + "mode": "source" + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->name, "data"); + EXPECT_EQ(expected->variablesReference, 8); + EXPECT_EQ(expected->frameId, 9u); + EXPECT_EQ(expected->bytes, 10); + EXPECT_EQ(expected->asAddress, false); + EXPECT_EQ(expected->mode, "source"); + + // Check required keys. + EXPECT_THAT_EXPECTED(parse<DataBreakpointInfoArguments>(R"({})"), + FailedWithMessage("missing value at (root).name")); + EXPECT_THAT_EXPECTED(parse<DataBreakpointInfoArguments>(R"({"name":"data"})"), + llvm::Succeeded()); +} diff --git a/llvm/docs/AArch64SME.rst b/llvm/docs/AArch64SME.rst index 47ed7bc..327f9dc 100644 --- a/llvm/docs/AArch64SME.rst +++ b/llvm/docs/AArch64SME.rst @@ -124,7 +124,7 @@ In this table, we use the following abbreviations: either 0 or 1 on entry, and is unchanged on return). Functions with ``__attribute__((arm_locally_streaming))`` are excluded from this -table because for the caller the attribute is synonymous to 'streaming', and +table because for the caller the attribute is synonymous with 'streaming', and for the callee it is merely an implementation detail that is explicitly not exposed to the caller. @@ -158,7 +158,7 @@ the function's body, so that it can place the mode changes in exactly the right position. The suitable place to do this seems to be SelectionDAG, where it lowers the call's arguments/return values to implement the specified calling convention. SelectionDAG provides Chains and Glue to specify the order of operations and give -preliminary control over the instruction's scheduling. +preliminary control over instruction scheduling. Example of preserving state @@ -232,8 +232,8 @@ implement transitions from ``SC -> N`` and ``SC -> S``. 
Unchained Function calls ------------------------ When a function with "``aarch64_pstate_sm_enabled``" calls a function that is not -streaming compatible, the compiler has to insert a SMSTOP before the call and -insert a SMSTOP after the call. +streaming compatible, the compiler has to insert an SMSTOP before the call and +insert an SMSTART after the call. If the function that is called is an intrinsic with no side-effects which in turn is lowered to a function call (e.g., ``@llvm.cos()``), then the call to @@ -388,7 +388,7 @@ The value of PSTATE.SM is not controlled by the feature flags, but rather by the function attributes. This means that we can compile for '``+sme``', and the compiler will code-generate any instructions, even if they are not legal under the requested streaming mode. The compiler needs to use the function attributes to ensure the -compiler doesn't do transformations under the assumption that certain operations +compiler doesn't perform transformations under the assumption that certain operations are available at runtime. We made a conscious choice not to model this with feature flags because we @@ -399,11 +399,11 @@ and `D121208 <https://reviews.llvm.org/D121208>`_) because of limitations in TableGen. As a first step, this means we'll disable vectorization (LoopVectorize/SLP) -entirely when the a function has either of the ``aarch64_pstate_sm_enabled``, +entirely when a function has any of the ``aarch64_pstate_sm_enabled``, ``aarch64_pstate_sm_body`` or ``aarch64_pstate_sm_compatible`` attributes, in order to avoid the use of vector instructions. -Later on we'll aim to relax these restrictions to enable scalable +Later on, we'll aim to relax these restrictions to enable scalable auto-vectorization with a subset of streaming-compatible instructions, but that requires changes to the CostModel, Legalization and SelectionDAG lowering. 
@@ -416,7 +416,7 @@ Other things to consider ------------------------ * Inlining must be disabled when the call-site needs to toggle PSTATE.SM or - when the callee's function body is executed in a different streaming mode than + when the callee's function body is executed in a different streaming mode from its caller. This is needed because function calls are the boundaries for streaming mode changes. @@ -434,8 +434,8 @@ lazy-save mechanism for calls to private-ZA functions (i.e. functions that may either directly or indirectly clobber ZA state). For the purpose of handling functions marked with ``aarch64_new_za``, -we have introduced a new LLVM IR pass (SMEABIPass) that is run just before -SelectionDAG. Any such functions dealt with by this pass are marked with +we have introduced a new LLVM IR pass (SMEABIPass) that runs just before +SelectionDAG. Any such functions handled by this pass are marked with ``aarch64_expanded_pstate_za``. Setting up a lazy-save @@ -458,7 +458,7 @@ AArch64 Predicate-as-Counter Type The predicate-as-counter type represents the type of a predicate-as-counter value held in an AArch64 SVE predicate register. Such a value contains information about the number of active lanes, the element width and a bit that -tells whether the generated mask should be inverted. ACLE intrinsics should be +indicates whether the generated mask should be inverted. ACLE intrinsics should be used to move the predicate-as-counter value to/from a predicate vector. There are certain limitations on the type: @@ -466,7 +466,7 @@ There are certain limitations on the type: * The type can be used for function parameters and return values. * The supported LLVM operations on this type are limited to ``load``, ``store``, - ``phi``, ``select`` and ``alloca`` instructions. + ``phi``, ``select``, and ``alloca`` instructions. The predicate-as-counter type is a scalable type. 
diff --git a/llvm/docs/HowToBuildOnARM.rst b/llvm/docs/HowToBuildOnARM.rst index 9eb6b5a..30e3744 100644 --- a/llvm/docs/HowToBuildOnARM.rst +++ b/llvm/docs/HowToBuildOnARM.rst @@ -23,10 +23,10 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips. choices when using CMake. Autoconf usage is deprecated as of 3.8. Building LLVM/Clang in ``Release`` mode is preferred since it consumes - a lot less memory. Otherwise, the building process will very likely + a lot less memory. Otherwise, the build process will very likely fail due to insufficient memory. It's also a lot quicker to only build the relevant back-ends (ARM and AArch64), since it's very unlikely that - you'll use an ARM board to cross-compile to other arches. If you're + you'll use an ARM board to cross-compile to other architectures. If you're running Compiler-RT tests, also include the x86 back-end, or some tests will fail. @@ -48,15 +48,15 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips. ``make -jN check-all`` or ``ninja check-all`` will run all compiler tests. For running the test suite, please refer to :doc:`TestingGuide`. -#. If you are building LLVM/Clang on an ARM board with 1G of memory or less, - please use ``gold`` rather then GNU ``ld``. In any case it is probably a good +#. If you are building LLVM/Clang on an ARM board with 1 GB of memory or less, + please use ``gold`` rather than GNU ``ld``. In any case, it is probably a good idea to set up a swap partition, too. .. code-block:: bash $ sudo ln -sf /usr/bin/ld /usr/bin/ld.gold -#. ARM development boards can be unstable and you may experience that cores +#. ARM development boards can be unstable, and you may experience that cores are disappearing, caches being flushed on every big.LITTLE switch, and other similar issues. 
To help ease the effect of this, set the Linux scheduler to "performance" on **all** cores using this little script: @@ -73,12 +73,12 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips. problems. #. Running the build on SD cards is ok, but they are more prone to failures - than good quality USB sticks, and those are more prone to failures than - external hard-drives (those are also a lot faster). So, at least, you + than good-quality USB sticks, and those are more prone to failures than + external hard drives (those are also a lot faster). So, at least, you should consider to buy a fast USB stick. On systems with a fast eMMC, that's a good option too. #. Make sure you have a decent power supply (dozens of dollars worth) that can - provide *at least* 4 amperes, this is especially important if you use USB - devices with your board. Externally powered USB/SATA harddrives are even + provide *at least* 4 amperes. This is especially important if you use USB + devices with your board. Externally powered USB/SATA hard drives are even better than having a good power supply. diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 6bc6284..a1bfce7 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -1313,7 +1313,7 @@ public: /// Assuming this is an IEEE-754 NaN value, quiet its signaling bit. /// This preserves the sign and payload bits. 
- APFloat makeQuiet() const { + [[nodiscard]] APFloat makeQuiet() const { APFloat Result(*this); Result.getIEEE().makeQuiet(); return Result; diff --git a/llvm/include/llvm/BinaryFormat/XCOFF.h b/llvm/include/llvm/BinaryFormat/XCOFF.h index 24d5c74..9f571b9 100644 --- a/llvm/include/llvm/BinaryFormat/XCOFF.h +++ b/llvm/include/llvm/BinaryFormat/XCOFF.h @@ -412,7 +412,7 @@ struct TracebackTable { static constexpr uint8_t LanguageIdShift = 16; // Byte 3 - static constexpr uint32_t IsGlobaLinkageMask = 0x0000'8000; + static constexpr uint32_t IsGlobalLinkageMask = 0x0000'8000; static constexpr uint32_t IsOutOfLineEpilogOrPrologueMask = 0x0000'4000; static constexpr uint32_t HasTraceBackTableOffsetMask = 0x0000'2000; static constexpr uint32_t IsInternalProcedureMask = 0x0000'1000; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h index 44ef289..41c3089 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h @@ -51,7 +51,11 @@ public: virtual void reserve(size_t NumBytes, OnReservedFunction OnReserved) = 0; /// Provides working memory - virtual char *prepare(ExecutorAddr Addr, size_t ContentSize) = 0; + /// The LinkGraph parameter is included to allow implementations to allocate + /// working memory from the LinkGraph's allocator, in which case it will be + /// deallocated when the LinkGraph is destroyed. 
+ virtual char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) = 0; using OnInitializedFunction = unique_function<void(Expected<ExecutorAddr>)>; @@ -92,7 +96,8 @@ public: void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override; - char *prepare(ExecutorAddr Addr, size_t ContentSize) override; + char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) override; void deinitialize(ArrayRef<ExecutorAddr> Allocations, OnDeinitializedFunction OnDeInitialized) override; @@ -142,7 +147,8 @@ public: void reserve(size_t NumBytes, OnReservedFunction OnReserved) override; - char *prepare(ExecutorAddr Addr, size_t ContentSize) override; + char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) override; void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override; diff --git a/llvm/include/llvm/Support/BinaryStreamWriter.h b/llvm/include/llvm/Support/BinaryStreamWriter.h index dddf53b..39ce0b6 100644 --- a/llvm/include/llvm/Support/BinaryStreamWriter.h +++ b/llvm/include/llvm/Support/BinaryStreamWriter.h @@ -10,6 +10,7 @@ #define LLVM_SUPPORT_BINARYSTREAMWRITER_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/BinaryStreamArray.h" #include "llvm/Support/BinaryStreamError.h" @@ -69,8 +70,7 @@ public: static_assert(std::is_enum<T>::value, "Cannot call writeEnum with non-Enum type"); - using U = std::underlying_type_t<T>; - return writeInteger<U>(static_cast<U>(Num)); + return writeInteger(llvm::to_underlying(Num)); } /// Write the unsigned integer Value to the underlying stream using ULEB128 diff --git a/llvm/include/llvm/Support/ScopedPrinter.h b/llvm/include/llvm/Support/ScopedPrinter.h index 94080e8..7b87fda 100644 --- a/llvm/include/llvm/Support/ScopedPrinter.h +++ b/llvm/include/llvm/Support/ScopedPrinter.h @@ -11,6 +11,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" +#include 
"llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -57,8 +58,7 @@ struct HexNumber { HexNumber(unsigned long Value) : Value(Value) {} HexNumber(unsigned long long Value) : Value(Value) {} template <typename EnumT, typename = std::enable_if_t<std::is_enum_v<EnumT>>> - HexNumber(EnumT Value) - : HexNumber(static_cast<std::underlying_type_t<EnumT>>(Value)) {} + HexNumber(EnumT Value) : HexNumber(llvm::to_underlying(Value)) {} uint64_t Value; }; @@ -84,7 +84,7 @@ struct FlagEntry { : Name(Name), Value(Value) {} template <typename EnumT, typename = std::enable_if_t<std::is_enum_v<EnumT>>> FlagEntry(StringRef Name, EnumT Value) - : FlagEntry(Name, static_cast<std::underlying_type_t<EnumT>>(Value)) {} + : FlagEntry(Name, llvm::to_underlying(Value)) {} StringRef Name; uint64_t Value; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 30bcff7..b5b4cd9 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15633,47 +15633,34 @@ void ScalarEvolution::LoopGuards::collectFromBlock( return false; }; - // Checks whether Expr is a non-negative constant, and Divisor is a positive - // constant, and returns their APInt in ExprVal and in DivisorVal. - auto GetNonNegExprAndPosDivisor = [&](const SCEV *Expr, const SCEV *Divisor, - APInt &ExprVal, APInt &DivisorVal) { - auto *ConstExpr = dyn_cast<SCEVConstant>(Expr); - auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor); - if (!ConstExpr || !ConstDivisor) - return false; - ExprVal = ConstExpr->getAPInt(); - DivisorVal = ConstDivisor->getAPInt(); - return ExprVal.isNonNegative() && !DivisorVal.isNonPositive(); - }; - // Return a new SCEV that modifies \p Expr to the closest number divides by - // \p Divisor and greater or equal than Expr. - // For now, only handle constant Expr and Divisor. + // \p Divisor and greater or equal than Expr. 
For now, only handle constant + // Expr. auto GetNextSCEVDividesByDivisor = [&](const SCEV *Expr, - const SCEV *Divisor) { - APInt ExprVal; - APInt DivisorVal; - if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal)) + const APInt &DivisorVal) { + const APInt *ExprVal; + if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || + DivisorVal.isNonPositive()) return Expr; - APInt Rem = ExprVal.urem(DivisorVal); - if (!Rem.isZero()) - // return the SCEV: Expr + Divisor - Expr % Divisor - return SE.getConstant(ExprVal + DivisorVal - Rem); - return Expr; + APInt Rem = ExprVal->urem(DivisorVal); + if (Rem.isZero()) + return Expr; + // return the SCEV: Expr + Divisor - Expr % Divisor + return SE.getConstant(*ExprVal + DivisorVal - Rem); }; // Return a new SCEV that modifies \p Expr to the closest number divides by - // \p Divisor and less or equal than Expr. - // For now, only handle constant Expr and Divisor. + // \p Divisor and less or equal than Expr. For now, only handle constant + // Expr. 
auto GetPreviousSCEVDividesByDivisor = [&](const SCEV *Expr, - const SCEV *Divisor) { - APInt ExprVal; - APInt DivisorVal; - if (!GetNonNegExprAndPosDivisor(Expr, Divisor, ExprVal, DivisorVal)) + const APInt &DivisorVal) { + const APInt *ExprVal; + if (!match(Expr, m_scev_APInt(ExprVal)) || ExprVal->isNegative() || + DivisorVal.isNonPositive()) return Expr; - APInt Rem = ExprVal.urem(DivisorVal); + APInt Rem = ExprVal->urem(DivisorVal); // return the SCEV: Expr - Expr % Divisor - return SE.getConstant(ExprVal - Rem); + return SE.getConstant(*ExprVal - Rem); }; // Apply divisibilty by \p Divisor on MinMaxExpr with constant values, @@ -15682,6 +15669,11 @@ void ScalarEvolution::LoopGuards::collectFromBlock( std::function<const SCEV *(const SCEV *, const SCEV *)> ApplyDivisibiltyOnMinMaxExpr = [&](const SCEV *MinMaxExpr, const SCEV *Divisor) { + auto *ConstDivisor = dyn_cast<SCEVConstant>(Divisor); + if (!ConstDivisor) + return MinMaxExpr; + const APInt &DivisorVal = ConstDivisor->getAPInt(); + const SCEV *MinMaxLHS = nullptr, *MinMaxRHS = nullptr; SCEVTypes SCTy; if (!IsMinMaxSCEVWithNonNegativeConstant(MinMaxExpr, SCTy, MinMaxLHS, @@ -15692,8 +15684,8 @@ void ScalarEvolution::LoopGuards::collectFromBlock( assert(SE.isKnownNonNegative(MinMaxLHS) && "Expected non-negative operand!"); auto *DivisibleExpr = - IsMin ? GetPreviousSCEVDividesByDivisor(MinMaxLHS, Divisor) - : GetNextSCEVDividesByDivisor(MinMaxLHS, Divisor); + IsMin ? 
GetPreviousSCEVDividesByDivisor(MinMaxLHS, DivisorVal) + : GetNextSCEVDividesByDivisor(MinMaxLHS, DivisorVal); SmallVector<const SCEV *> Ops = { ApplyDivisibiltyOnMinMaxExpr(MinMaxRHS, Divisor), DivisibleExpr}; return SE.getMinMaxExpr(SCTy, Ops); @@ -15750,10 +15742,7 @@ void ScalarEvolution::LoopGuards::collectFromBlock( }; const SCEV *RewrittenLHS = GetMaybeRewritten(LHS); - const SCEV *DividesBy = nullptr; - const APInt &Multiple = SE.getConstantMultiple(RewrittenLHS); - if (!Multiple.isOne()) - DividesBy = SE.getConstant(Multiple); + const APInt &DividesBy = SE.getConstantMultiple(RewrittenLHS); // Collect rewrites for LHS and its transitive operands based on the // condition. @@ -15775,21 +15764,21 @@ void ScalarEvolution::LoopGuards::collectFromBlock( [[fallthrough]]; case CmpInst::ICMP_SLT: { RHS = SE.getMinusSCEV(RHS, One); - RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS; + RHS = GetPreviousSCEVDividesByDivisor(RHS, DividesBy); break; } case CmpInst::ICMP_UGT: case CmpInst::ICMP_SGT: RHS = SE.getAddExpr(RHS, One); - RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS; + RHS = GetNextSCEVDividesByDivisor(RHS, DividesBy); break; case CmpInst::ICMP_ULE: case CmpInst::ICMP_SLE: - RHS = DividesBy ? GetPreviousSCEVDividesByDivisor(RHS, DividesBy) : RHS; + RHS = GetPreviousSCEVDividesByDivisor(RHS, DividesBy); break; case CmpInst::ICMP_UGE: case CmpInst::ICMP_SGE: - RHS = DividesBy ? GetNextSCEVDividesByDivisor(RHS, DividesBy) : RHS; + RHS = GetNextSCEVDividesByDivisor(RHS, DividesBy); break; default: break; @@ -15843,7 +15832,7 @@ void ScalarEvolution::LoopGuards::collectFromBlock( case CmpInst::ICMP_NE: if (match(RHS, m_scev_Zero())) { const SCEV *OneAlignedUp = - DividesBy ? 
GetNextSCEVDividesByDivisor(One, DividesBy) : One; + GetNextSCEVDividesByDivisor(One, DividesBy); To = SE.getUMaxExpr(FromRewritten, OneAlignedUp); } break; diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp index 9cc6c6a..c500357 100644 --- a/llvm/lib/CodeGen/ExpandFp.cpp +++ b/llvm/lib/CodeGen/ExpandFp.cpp @@ -82,7 +82,7 @@ public: } static FRemExpander create(IRBuilder<> &B, Type *Ty) { - assert(canExpandType(Ty)); + assert(canExpandType(Ty) && "Expected supported floating point type"); // The type to use for the computation of the remainder. This may be // wider than the input/result type which affects the ... @@ -356,8 +356,9 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y, static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) { LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n'); - Type *ReturnTy = I.getType(); - assert(FRemExpander::canExpandType(ReturnTy->getScalarType())); + Type *Ty = I.getType(); + assert(FRemExpander::canExpandType(Ty) && + "Expected supported floating point type"); FastMathFlags FMF = I.getFastMathFlags(); // TODO Make use of those flags for optimization? @@ -368,32 +369,10 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) { B.setFastMathFlags(FMF); B.SetCurrentDebugLocation(I.getDebugLoc()); - Type *ElemTy = ReturnTy->getScalarType(); - const FRemExpander Expander = FRemExpander::create(B, ElemTy); - - Value *Ret; - if (ReturnTy->isFloatingPointTy()) - Ret = FMF.approxFunc() - ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1)) - : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ); - else { - auto *VecTy = cast<FixedVectorType>(ReturnTy); - - // This could use SplitBlockAndInsertForEachLane but the interface - // is a bit awkward for a constant number of elements and it will - // boil down to the same code. - // TODO Expand the FRem instruction only once and reuse the code. 
- Value *Nums = I.getOperand(0); - Value *Denums = I.getOperand(1); - Ret = PoisonValue::get(I.getType()); - for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) { - Value *Num = B.CreateExtractElement(Nums, I); - Value *Denum = B.CreateExtractElement(Denums, I); - Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum) - : Expander.buildFRem(Num, Denum, SQ); - Ret = B.CreateInsertElement(Ret, Rem, I); - } - } + const FRemExpander Expander = FRemExpander::create(B, Ty); + Value *Ret = FMF.approxFunc() + ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1)) + : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ); I.replaceAllUsesWith(Ret); Ret->takeName(&I); @@ -939,7 +918,8 @@ static void expandIToFP(Instruction *IToFP) { IToFP->eraseFromParent(); } -static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) { +static void scalarize(Instruction *I, + SmallVectorImpl<Instruction *> &Worklist) { VectorType *VTy = cast<FixedVectorType>(I->getType()); IRBuilder<> Builder(I); @@ -948,12 +928,25 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) { Value *Result = PoisonValue::get(VTy); for (unsigned Idx = 0; Idx < NumElements; ++Idx) { Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx); - Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext, - I->getType()->getScalarType()); - Result = Builder.CreateInsertElement(Result, Cast, Idx); - if (isa<Instruction>(Cast)) - Replace.push_back(cast<Instruction>(Cast)); + + Value *NewOp = nullptr; + if (auto *BinOp = dyn_cast<BinaryOperator>(I)) + NewOp = Builder.CreateBinOp( + BinOp->getOpcode(), Ext, + Builder.CreateExtractElement(I->getOperand(1), Idx)); + else if (auto *CastI = dyn_cast<CastInst>(I)) + NewOp = Builder.CreateCast(CastI->getOpcode(), Ext, + I->getType()->getScalarType()); + else + llvm_unreachable("Unsupported instruction type"); + + Result = Builder.CreateInsertElement(Result, NewOp, Idx); + if (auto 
*ScalarizedI = dyn_cast<Instruction>(NewOp)) { + ScalarizedI->copyIRFlags(I, true); + Worklist.push_back(ScalarizedI); + } } + I->replaceAllUsesWith(Result); I->dropAllReferences(); I->eraseFromParent(); @@ -989,10 +982,17 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) { return TLI.getLibcallName(fremToLibcall(Ty->getScalarType())); } +static void addToWorklist(Instruction &I, + SmallVector<Instruction *, 4> &Worklist) { + if (I.getOperand(0)->getType()->isVectorTy()) + scalarize(&I, Worklist); + else + Worklist.push_back(&I); +} + static bool runImpl(Function &F, const TargetLowering &TLI, AssumptionCache *AC) { - SmallVector<Instruction *, 4> Replace; - SmallVector<Instruction *, 4> ReplaceVector; + SmallVector<Instruction *, 4> Worklist; bool Modified = false; unsigned MaxLegalFpConvertBitWidth = @@ -1003,56 +1003,39 @@ static bool runImpl(Function &F, const TargetLowering &TLI, if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS) return false; - for (auto &I : instructions(F)) { - switch (I.getOpcode()) { - case Instruction::FRem: { - Type *Ty = I.getType(); - // TODO: This pass doesn't handle scalable vectors. - if (Ty->isScalableTy()) - continue; - - if (targetSupportsFrem(TLI, Ty) || - !FRemExpander::canExpandType(Ty->getScalarType())) - continue; - - Replace.push_back(&I); - Modified = true; + for (auto It = inst_begin(&F), End = inst_end(F); It != End;) { + Instruction &I = *It++; + Type *Ty = I.getType(); + // TODO: This pass doesn't handle scalable vectors. + if (Ty->isScalableTy()) + continue; + switch (I.getOpcode()) { + case Instruction::FRem: + if (!targetSupportsFrem(TLI, Ty) && + FRemExpander::canExpandType(Ty->getScalarType())) { + addToWorklist(I, Worklist); + Modified = true; + } break; - } case Instruction::FPToUI: case Instruction::FPToSI: { - // TODO: This pass doesn't handle scalable vectors. 
- if (I.getOperand(0)->getType()->isScalableTy()) - continue; - - auto *IntTy = cast<IntegerType>(I.getType()->getScalarType()); + auto *IntTy = cast<IntegerType>(Ty->getScalarType()); if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth) continue; - if (I.getOperand(0)->getType()->isVectorTy()) - ReplaceVector.push_back(&I); - else - Replace.push_back(&I); + addToWorklist(I, Worklist); Modified = true; break; } case Instruction::UIToFP: case Instruction::SIToFP: { - // TODO: This pass doesn't handle scalable vectors. - if (I.getOperand(0)->getType()->isScalableTy()) - continue; - auto *IntTy = cast<IntegerType>(I.getOperand(0)->getType()->getScalarType()); if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth) continue; - if (I.getOperand(0)->getType()->isVectorTy()) - ReplaceVector.push_back(&I); - else - Replace.push_back(&I); - Modified = true; + addToWorklist(I, Worklist); break; } default: @@ -1060,16 +1043,8 @@ static bool runImpl(Function &F, const TargetLowering &TLI, } } - while (!ReplaceVector.empty()) { - Instruction *I = ReplaceVector.pop_back_val(); - scalarize(I, Replace); - } - - if (Replace.empty()) - return false; - - while (!Replace.empty()) { - Instruction *I = Replace.pop_back_val(); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); if (I->getOpcode() == Instruction::FRem) { auto SQ = [&]() -> std::optional<SimplifyQuery> { if (AC) { diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp index 33734b8..bb8d2cb 100644 --- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp @@ -90,7 +90,7 @@ void MapperJITLinkMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, auto TotalSize = Seg.ContentSize + Seg.ZeroFillSize; Seg.Addr = NextSegAddr; - Seg.WorkingMem = Mapper->prepare(NextSegAddr, TotalSize); + Seg.WorkingMem = Mapper->prepare(G, NextSegAddr, 
TotalSize); NextSegAddr += alignTo(TotalSize, Mapper->getPageSize()); diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index ea3b22a..7b327af 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -58,7 +58,8 @@ void InProcessMemoryMapper::reserve(size_t NumBytes, ExecutorAddrRange(ExecutorAddr::fromPtr(MB.base()), MB.allocatedSize())); } -char *InProcessMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) { +char *InProcessMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) { return Addr.toPtr<char *>(); } @@ -324,7 +325,8 @@ void SharedMemoryMapper::reserve(size_t NumBytes, #endif } -char *SharedMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) { +char *SharedMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) { auto R = Reservations.upper_bound(Addr); assert(R != Reservations.begin() && "Attempt to prepare unreserved range"); R--; diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp index 7a8c8ad..ed1f750 100644 --- a/llvm/lib/Object/XCOFFObjectFile.cpp +++ b/llvm/lib/Object/XCOFFObjectFile.cpp @@ -1568,7 +1568,7 @@ uint8_t XCOFFTracebackTable::getLanguageID() const { } bool XCOFFTracebackTable::isGlobalLinkage() const { - return GETBITWITHMASK(0, IsGlobaLinkageMask); + return GETBITWITHMASK(0, IsGlobalLinkageMask); } bool XCOFFTracebackTable::isOutOfLineEpilogOrPrologue() const { diff --git a/llvm/lib/Support/VirtualOutputBackends.cpp b/llvm/lib/Support/VirtualOutputBackends.cpp index d6d7b87..de59b8a 100644 --- a/llvm/lib/Support/VirtualOutputBackends.cpp +++ b/llvm/lib/Support/VirtualOutputBackends.cpp @@ -498,7 +498,7 @@ Error OnDiskOutputFile::keep() { // Someone else owns the lock on this file, wait. 
switch (Lock.waitForUnlockFor(std::chrono::seconds(256))) { case WaitForUnlockResult::Success: - LLVM_FALLTHROUGH; + [[fallthrough]]; case WaitForUnlockResult::OwnerDied: { continue; // try again to get the lock. } diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index f110558..7e03b97 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() { } bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes); - // Assume we can't combine the last pop with the sp restore. - bool CombineAfterCSRBump = false; + + unsigned ProloguePopSize = PrologueSaveSize; if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack + // that needs to be popped until we reach the start of the SVE save area. + // The "FixedObject" stack occurs after the SVE area and must be popped + // later. + ProloguePopSize -= FixedObject; AfterCSRPopSize += FixedObject; - } else if (!CombineSPBump && PrologueSaveSize != 0) { + } + + // Assume we can't combine the last pop with the sp restore. + if (!CombineSPBump && ProloguePopSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION || - AArch64InstrInfo::isSEHInstruction(*Pop)) + AArch64InstrInfo::isSEHInstruction(*Pop) || + (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord && + isPartOfSVECalleeSaves(Pop))) Pop = std::prev(Pop); // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. @@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() { // may clobber), convert it to a post-index ldp. 
if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) { convertCalleeSaveRestoreToSPPrePostIncDec( - Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy, - PrologueSaveSize); + Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy, + ProloguePopSize); + } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { + MachineBasicBlock::iterator AfterLastPop = std::next(Pop); + if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop)) + ++AfterLastPop; + // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate + // callee-save non-SVE registers to move the stack pointer to the start of + // the SVE area. + emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(ProloguePopSize), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, + &HasWinCFI); } else { - // If not, make sure to emit an add after the last ldp. + // Otherwise, make sure to emit an add after the last ldp. // We're doing this by transferring the size to be restored from the // adjustment *before* the CSR pops to the adjustment *after* the CSR // pops. - AfterCSRPopSize += PrologueSaveSize; - CombineAfterCSRBump = true; + AfterCSRPopSize += ProloguePopSize; } } - // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() { --SEHEpilogueStartI; } + // Determine the ranges of SVE callee-saves. This is done before emitting any + // code at the end of the epilogue (for Swift async), which can get in the way + // of finding SVE callee-saves with CalleeSavesAboveFrameRecord. + auto [PPR, ZPR] = getSVEStackFrameSizes(); + auto [PPRRange, ZPRRange] = partitionSVECS( + MBB, + SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord + ? 
MBB.getFirstTerminator() + : FirstGPRRestoreI, + PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); + if (HasFP && AFI->hasSwiftAsyncContext()) emitSwiftAsyncContextFramePointer(EpilogueEndI, DL); @@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); - auto [PPR, ZPR] = getSVEStackFrameSizes(); - auto [PPRRange, ZPRRange] = partitionSVECS( - MBB, - SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord - ? MBB.getFirstTerminator() - : FirstGPRRestoreI, - PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true); - StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; StackOffset SVEStackSize = SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize; @@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { NeedsWinCFI, &HasWinCFI); } - // Deallocate callee-save non-SVE registers. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - - // Deallocate fixed objects. - emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, - StackOffset::getFixed(FixedObject), TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); - // Deallocate callee-save SVE registers. emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false, @@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI, - StackOffset::getFixed(CombineAfterCSRBump ? 
PrologueSaveSize : 0)); + StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore)); } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 12ddf47..53b00e8 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -273,7 +273,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; ScatterOverhead = 13; - LLVM_FALLTHROUGH; + [[fallthrough]]; case NeoverseN2: case NeoverseN3: PrefFunctionAlignment = Align(16); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 5e27b37..6dcbced 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { // SMEM and VMEM operations. So there will never be // outstanding address translations for both SMEM and // VMEM at the same time. - setScoreLB(T, CurrScore - 1); + setScoreLB(T, getScoreUB(T) - 1); PendingEvents &= ~(1 << OtherEvent); } for (const MachineOperand &Op : Inst.all_uses()) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index eac9fd4..27e5ee9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3726,6 +3726,23 @@ def : GCNPat < } // End foreach Ty = ... 
} // End AddedComplexity = 1 +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (zext i16:$src_lo)), + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +def : GCNPat< + (i32 (DivergentBinFrag<or> + (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))), + (i32 (zext i16:$src_lo)) + )), + (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16) +>; +} + let True16Predicate = UseRealTrue16Insts in def : GCNPat < (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))), diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index be1c883..ebd2e7e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2356,7 +2356,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); - LLVM_FALLTHROUGH; + [[fallthrough]]; } case AMDGPU::SI_SPILL_V1024_SAVE: case AMDGPU::SI_SPILL_V512_SAVE: @@ -2446,7 +2446,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask)); - LLVM_FALLTHROUGH; + [[fallthrough]]; } case AMDGPU::SI_SPILL_V16_RESTORE: case AMDGPU::SI_SPILL_V32_RESTORE: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 7ddf996..f7deeaf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -632,7 +632,7 @@ SDValue LoongArchTargetLowering::lowerConstantFP(SDValue Op, case MVT::f32: { SDValue NewVal = DAG.getConstant(INTVal, DL, MVT::i32); if (Subtarget.is64Bit()) - NewVal = 
DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, NewVal); + NewVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, NewVal); return DAG.getNode(Subtarget.is64Bit() ? LoongArchISD::MOVGR2FR_W_LA64 : LoongArchISD::MOVGR2FR_W, DL, VT, NewVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a1fb665..272c21f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -233,7 +233,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2i8: case MVT::v2i64: case MVT::v2f64: @@ -248,7 +248,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // global and the target supports 256-bit loads/stores. if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2i16: // <1 x i16x2> case MVT::v2f16: // <1 x f16x2> case MVT::v2bf16: // <1 x bf16x2> @@ -270,7 +270,7 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; - LLVM_FALLTHROUGH; + [[fallthrough]]; case MVT::v2f32: // <1 x f32x2> case MVT::v4f32: // <2 x f32x2> case MVT::v2i32: // <1 x i32x2> @@ -6749,7 +6749,7 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::BinOp::Xchg: if (BitWidth == 128) return AtomicExpansionKind::None; - LLVM_FALLTHROUGH; + [[fallthrough]]; case AtomicRMWInst::BinOp::And: case AtomicRMWInst::BinOp::Or: case AtomicRMWInst::BinOp::Xor: diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 023fd14..bcb3f50 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2404,7 +2404,7 @@ void PPCAIXAsmPrinter::emitTracebackTable() { << static_cast<unsigned>(((V) & 
(TracebackTable::Field##Mask)) >> \ (TracebackTable::Field##Shift)) - GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobaLinkage); + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobalLinkage); GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsOutOfLineEpilogOrPrologue); EmitComment(); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 21dbb7c..4b54231 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1688,7 +1688,7 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, (1 << 25) - 1); // HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td. case Match_InvalidBareSymbolQC_E_LI: - LLVM_FALLTHROUGH; + [[fallthrough]]; // END HACK case Match_InvalidBareSImm32: return generateImmOutOfRangeError(Operands, ErrorInfo, diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index 27fba34..100f1ec 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -1164,14 +1164,13 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, I.getOpcode() == TargetOpcode::G_USUBO) && "unexpected instruction"); - const Register DstReg = I.getOperand(0).getReg(); - const Register CarryOutReg = I.getOperand(1).getReg(); - const Register Op0Reg = I.getOperand(2).getReg(); - const Register Op1Reg = I.getOperand(3).getReg(); - bool IsSub = I.getOpcode() == TargetOpcode::G_USUBE || - I.getOpcode() == TargetOpcode::G_USUBO; - bool HasCarryIn = I.getOpcode() == TargetOpcode::G_UADDE || - I.getOpcode() == TargetOpcode::G_USUBE; + auto &CarryMI = cast<GAddSubCarryOut>(I); + + const Register DstReg = CarryMI.getDstReg(); + const Register CarryOutReg = CarryMI.getCarryOutReg(); + const Register Op0Reg = CarryMI.getLHSReg(); + const Register Op1Reg = CarryMI.getRHSReg(); + 
bool IsSub = CarryMI.isSub(); const LLT DstTy = MRI.getType(DstReg); assert(DstTy.isScalar() && "selectUAddSub only supported for scalar types"); @@ -1207,14 +1206,15 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, llvm_unreachable("selectUAddSub unsupported type."); } - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB); + const RegisterBank &CarryRB = *RBI.getRegBank(CarryOutReg, MRI, TRI); + const TargetRegisterClass *CarryRC = + getRegClass(MRI.getType(CarryOutReg), CarryRB); unsigned Opcode = IsSub ? OpSUB : OpADD; // G_UADDE/G_USUBE - find CarryIn def instruction. - if (HasCarryIn) { - Register CarryInReg = I.getOperand(4).getReg(); + if (auto CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) { + Register CarryInReg = CarryInMI->getCarryInReg(); MachineInstr *Def = MRI.getVRegDef(CarryInReg); while (Def->getOpcode() == TargetOpcode::G_TRUNC) { CarryInReg = Def->getOperand(1).getReg(); @@ -1227,11 +1227,12 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, Def->getOpcode() == TargetOpcode::G_USUBE || Def->getOpcode() == TargetOpcode::G_USUBO) { // carry set by prev ADD/SUB. - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), - X86::EFLAGS) - .addReg(CarryInReg); - if (!RBI.constrainGenericRegister(CarryInReg, *DstRC, MRI)) + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::CMP8ri)) + .addReg(CarryInReg) + .addImm(1); + + if (!RBI.constrainGenericRegister(CarryInReg, *CarryRC, MRI)) return false; Opcode = IsSub ? 
OpSBB : OpADC; @@ -1250,11 +1251,11 @@ bool X86InstructionSelector::selectUAddSub(MachineInstr &I, .addReg(Op0Reg) .addReg(Op1Reg); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg) - .addReg(X86::EFLAGS); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), CarryOutReg) + .addImm(X86::COND_B); if (!constrainSelectedInstRegOperands(Inst, TII, TRI, RBI) || - !RBI.constrainGenericRegister(CarryOutReg, *DstRC, MRI)) + !RBI.constrainGenericRegister(CarryOutReg, *CarryRC, MRI)) return false; I.eraseFromParent(); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 11ef721..28fa2cd 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -194,11 +194,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .scalarize(0); getActionDefinitionsBuilder({G_UADDE, G_UADDO, G_USUBE, G_USUBO}) - .legalFor({{s8, s1}, {s16, s1}, {s32, s1}}) - .legalFor(Is64Bit, {{s64, s1}}) + .legalFor({{s8, s8}, {s16, s8}, {s32, s8}}) + .legalFor(Is64Bit, {{s64, s8}}) .widenScalarToNextPow2(0, /*Min=*/32) .clampScalar(0, s8, sMaxScalar) - .clampScalar(1, s1, s1) + .clampScalar(1, s8, s8) .scalarize(0); // integer multiply diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 080a9c0..4e73070 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -84,11 +84,11 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, case Xtensa::CCOMPARE0: if (FeatureBits[Xtensa::FeatureTimers1]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::CCOMPARE1: if (FeatureBits[Xtensa::FeatureTimers2]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::CCOMPARE2: if (FeatureBits[Xtensa::FeatureTimers3]) return true; @@ -107,37 
+107,37 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, case Xtensa::EXCSAVE1: case Xtensa::EXCVADDR: return FeatureBits[Xtensa::FeatureException]; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC2: case Xtensa::EPS2: case Xtensa::EXCSAVE2: if (FeatureBits[Xtensa::FeatureHighPriInterrupts]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC3: case Xtensa::EPS3: case Xtensa::EXCSAVE3: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC4: case Xtensa::EPS4: case Xtensa::EXCSAVE4: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC5: case Xtensa::EPS5: case Xtensa::EXCSAVE5: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC6: case Xtensa::EPS6: case Xtensa::EXCSAVE6: if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6]) return true; - LLVM_FALLTHROUGH; + [[fallthrough]]; case Xtensa::EPC7: case Xtensa::EPS7: case Xtensa::EXCSAVE7: diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp index cea246e..950bb2b 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -258,7 +258,7 @@ static std::string computePowerDataLayout(const Triple &T) { static std::string computeAMDDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. 
- return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + return "e-m:e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; } @@ -268,7 +268,7 @@ static std::string computeAMDDataLayout(const Triple &TT) { // (address space 7), and 128-bit non-integral buffer resourcees (address // space 8) which cannot be non-trivilally accessed by LLVM memory operations // like getelementptr. - return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" + return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-" "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-" "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 82ac903..3f11cae 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1690,6 +1690,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign( // 2) (fp_binop ({s|u}itofp x), FpC) // -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC))) Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) { + // Don't perform the fold on vectors, as the integer operation may be much + // more expensive than the float operation in that case. 
+ if (BO.getType()->isVectorTy()) + return nullptr; + std::array<Value *, 2> IntOps = {nullptr, nullptr}; Constant *Op1FpC = nullptr; // Check for: diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 66a2c76..09db464 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -2626,7 +2626,7 @@ void ObjCARCOpt::OptimizeAutoreleasePools(Function &F) { case ARCInstKind::Call: if (!MayAutorelease(cast<CallBase>(Inst))) break; - LLVM_FALLTHROUGH; + [[fallthrough]]; case ARCInstKind::Autorelease: case ARCInstKind::AutoreleaseRV: case ARCInstKind::FusedRetainAutorelease: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 7d376c3..fdfff16 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1642,6 +1642,19 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, return false; } + // The latch must be terminated by a BranchInst. + BasicBlock *Latch = Lp->getLoopLatch(); + if (Latch && !isa<BranchInst>(Latch->getTerminator())) { + reportVectorizationFailure( + "The loop latch terminator is not a BranchInst", + "loop control flow is not understood by vectorizer", "CFGNotUnderstood", + ORE, TheLoop); + if (DoExtraAnalysis) + Result = false; + else + return false; + } + return Result; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2388375..a6f4bec 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5291,19 +5291,19 @@ private: // data. for (TreeEntry *TE : Entries) { // Check if the user is commutative. - // The commutatives are handled later, as their oeprands can be + // The commutatives are handled later, as their operands can be // reordered. 
// Same applies even for non-commutative cmps, because we can invert // their predicate potentially and, thus, reorder the operands. bool IsCommutativeUser = ::isCommutative(User) || ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); - EdgeInfo EI(TE, U.getOperandNo()); if (!IsCommutativeUser && !isa<CmpInst>(User)) { unsigned &OpCnt = OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + EdgeInfo EI(TE, U.getOperandNo()); if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) - return false; + continue; // Found copyable operand - continue. ++OpCnt; continue; @@ -5312,33 +5312,38 @@ private: .first->getSecond(); } } - // Check the commutative/cmp entries. - if (!PotentiallyReorderedEntriesCount.empty()) { - for (auto &P : PotentiallyReorderedEntriesCount) { - auto *It = find(P.first->Scalars, User); - assert(It != P.first->Scalars.end() && - "User is not in the tree entry"); - int Lane = std::distance(P.first->Scalars.begin(), It); - assert(Lane >= 0 && "Lane is not found"); - if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) - Lane = P.first->ReorderIndices[Lane]; - assert(Lane < static_cast<int>(P.first->Scalars.size()) && - "Couldn't find extract lane"); - SmallVector<unsigned> OpIndices; - for (unsigned OpIdx : - seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( - P.first->getMainOp()))) { - if (P.first->getOperand(OpIdx)[Lane] == Op && - getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) - --P.getSecond(); - } - } - return all_of(PotentiallyReorderedEntriesCount, + if (PotentiallyReorderedEntriesCount.empty()) + return all_of(OrderedEntriesCount, [&](const std::pair<const TreeEntry *, unsigned> &P) { - return P.second == NumOps - 1; + return P.second == NumOps; }); - } - return true; + // Check the commutative/cmp entries. 
+ for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector<unsigned> OpIndices; + for (unsigned OpIdx : + seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps - 1; + }) && + all_of(OrderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps; + }); } SmallVector<ScheduleCopyableData *> @@ -20071,7 +20076,9 @@ Value *BoUpSLP::vectorizeTree( // The is because source vector that supposed to feed this gather node was // inserted at the end of the block [after stab instruction]. So we need // to adjust insertion point again to the end of block. - if (isa<PHINode>(UserI)) { + if (isa<PHINode>(UserI) || + (TE->UserTreeIndex.UserTE->hasState() && + TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) { // Insert before all users. 
Instruction *InsertPt = PrevVec->getParent()->getTerminator(); for (User *U : PrevVec->users()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1fea068..0101942 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -635,9 +635,9 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { const VPRecipeBase *R = &VPBB->back(); bool IsSwitch = isa<VPInstruction>(R) && cast<VPInstruction>(R)->getOpcode() == Instruction::Switch; - bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) || - match(R, m_BranchOnCond(m_VPValue())) || - match(R, m_BranchOnCount(m_VPValue(), m_VPValue())); + bool IsCondBranch = + isa<VPBranchOnMaskRecipe>(R) || + match(R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())); (void)IsCondBranch; (void)IsSwitch; if (VPBB->getNumSuccessors() == 2 || diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 81deba2..3e65d42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -433,8 +433,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB, // We are about to replace the branch to exit the region. Remove the original // BranchOnCond, if there is any. 
DebugLoc LatchDL = DL; - if (!LatchVPBB->empty() && - match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) { + if (!LatchVPBB->empty() && match(&LatchVPBB->back(), m_BranchOnCond())) { LatchDL = LatchVPBB->getTerminator()->getDebugLoc(); LatchVPBB->getTerminator()->eraseFromParent(); } @@ -875,8 +874,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { Plan.getVectorLoopRegion()->getEntryBasicBlock())) { auto *VPBB = cast<VPBasicBlock>(VPB); for (auto &R : *VPBB) { - if (R.mayWriteToMemory() && - !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + if (R.mayWriteToMemory() && !match(&R, m_BranchOnCount())) return false; } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 555efea..b42b049 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -344,6 +344,10 @@ m_Freeze(const Op0_t &Op0) { return m_VPInstruction<Instruction::Freeze>(Op0); } +inline VPInstruction_match<VPInstruction::BranchOnCond> m_BranchOnCond() { + return m_VPInstruction<VPInstruction::BranchOnCond>(); +} + template <typename Op0_t> inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t> m_BranchOnCond(const Op0_t &Op0) { @@ -374,6 +378,10 @@ m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2); } +inline VPInstruction_match<VPInstruction::BranchOnCount> m_BranchOnCount() { + return m_VPInstruction<VPInstruction::BranchOnCount>(); +} + template <typename Op0_t, typename Op1_t> inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t> m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8e916772..2368d18 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ 
-1154,7 +1154,7 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, case VPInstruction::ExtractPenultimateElement: if (VF == ElementCount::getScalable(1)) return InstructionCost::getInvalid(); - LLVM_FALLTHROUGH; + [[fallthrough]]; default: // TODO: Compute cost other VPInstructions once the legacy cost model has // been retired. @@ -2855,7 +2855,7 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, case ExpressionTypes::ExtNegatedMulAccReduction: assert(Opcode == Instruction::Add && "Unexpected opcode"); Opcode = Instruction::Sub; - LLVM_FALLTHROUGH; + [[fallthrough]]; case ExpressionTypes::ExtMulAccReduction: { return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9bb8820..af755ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1658,7 +1658,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, auto *Term = &ExitingVPBB->back(); VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); - if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || + if (match(Term, m_BranchOnCount()) || match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch @@ -3398,9 +3398,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBuilder Builder(LatchVPBB->getTerminator()); VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0]; - assert( - match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) && - "Terminator must be be BranchOnCond"); + assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) && + "Terminator must be be BranchOnCond"); VPValue *CondOfEarlyExitingVPBB = 
EarlyExitingVPBB->getTerminator()->getOperand(0); auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB @@ -4009,8 +4008,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VFMinVal = VF.getKnownMinValue(); SmallVector<VPInterleaveRecipe *> StoreGroups; for (auto &R : *VectorLoop->getEntryBasicBlock()) { - if (isa<VPCanonicalIVPHIRecipe>(&R) || - match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount())) continue; if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 5e7f19f..1c4adfc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -259,8 +259,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, /// Handle non-header-phi recipes. void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { - if (match(&R, m_BranchOnCond(m_VPValue())) || - match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) return; if (auto *VPI = dyn_cast<VPInstruction>(&R)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 013ea2e..752e03d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -24,6 +24,7 @@ #define DEBUG_TYPE "loop-vectorize" using namespace llvm; +using namespace VPlanPatternMatch; namespace { class VPlanVerifier { @@ -198,7 +199,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { } // EVLIVIncrement is only used by EVLIV & BranchOnCount. // Having more than two users is unexpected. 
- using namespace llvm::VPlanPatternMatch; if (I->getOpcode() != VPInstruction::Broadcast && I->getNumUsers() != 1 && (I->getNumUsers() != 2 || @@ -479,8 +479,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) { } auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end())); - if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount && - LastInst->getOpcode() != VPInstruction::BranchOnCond)) { + if (!match(LastInst, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) { errs() << "VPlan vector loop exit must end with BranchOnCount or " "BranchOnCond VPInstruction\n"; return false; diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir index 5933c5d..b8302e6 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir @@ -380,10 +380,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4) ; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0 ; CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 1 :: (load (s16) from %stack.3) @@ -430,10 +428,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: 
frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3) @@ -557,10 +553,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 ; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3) ; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16 - ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5) - ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 + ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5) + ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20) @@ -745,10 +739,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 ; CHECK-NEXT: frame-destroy SEH_SetFP - ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3) - ; CHECK-NEXT: frame-destroy SEH_SaveFPLR 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3) + ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19) ; CHECK-NEXT: 
frame-destroy SEH_SaveZReg 8, 2 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18) @@ -869,10 +861,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg ; CHECK-NEXT: frame-destroy SEH_AllocZ 7 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1 ; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7) diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll index 53ac934..3ba4a1c 100644 --- a/llvm/test/CodeGen/AArch64/win-sve.ll +++ b/llvm/test/CodeGen/AArch64/win-sve.ll @@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; 
CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 
16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: stur x0, [x8, #16] ; CHECK-NEXT: addvl x8, x29, #18 ; CHECK-NEXT: ldr x1, [x8, #32] -; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .Ltmp0: // EH_LABEL ; CHECK-NEXT: add x0, x19, #0 ; CHECK-NEXT: bl g6 -; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .Ltmp1: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %invoke.cont ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: add sp, sp, #64 @@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_preg p14, 10 ; CHECK-NEXT: ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: .seh_save_preg p15, 11 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .seh_allocz 18 ; CHECK-NEXT: add sp, sp, #16 @@ -1024,10 +1010,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 
32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .seh_allocz 1 ; CHECK-NEXT: add sp, sp, #64 @@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr 
z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str d0, [sp, #8] ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .seh_allocz 1 -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" { ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" { ; CHECK-NEXT: .seh_allocz 1 ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: 
ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1594,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) { store i32 %i, ptr %a ret void } + +declare ptr @llvm.swift.async.context.addr() + +define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) { +; CHECK-LABEL: f16: +; CHECK: .seh_proc f16 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: orr x29, x29, #0x1000000000000000 +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .seh_allocz 1 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_zreg z8, 0 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 8 +; CHECK-NEXT: str x22, [sp] +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: add x29, sp, #8 +; CHECK-NEXT: .seh_add_fp 8 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x8, [x22] +; CHECK-NEXT: stur x8, [x29, #-8] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: add sp, sp, 
#16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 8 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_zreg z8, 0 +; CHECK-NEXT: and x29, x29, #0xefffffffffffffff +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .seh_allocz 1 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + tail call void asm sideeffect "", "~{z8}"() + %1 = load ptr, ptr %ctx, align 8 + %2 = tail call ptr @llvm.swift.async.context.addr() + store ptr %1, ptr %2, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 549af87..a43bfb5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; CI-NEXT: s_cbranch_vccz .LBB9_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else20 ; CI-NEXT: s_and_b32 s2, s0, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB9_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute19 ; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 @@ -1083,10 +1083,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_6 -; CI-NEXT: ; %bb.4: ; 
%frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_7 ; CI-NEXT: .LBB9_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1125,7 +1125,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; CI-NEXT: s_cbranch_vccz .LBB9_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_and_b32 s4, s2, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s4, s4, 1 ; CI-NEXT: s_cmp_lg_u32 s4, 0 ; CI-NEXT: s_cbranch_scc1 .LBB9_16 -; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 @@ -1161,10 +1161,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: .LBB9_13: ; 
%frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_15 ; CI-NEXT: .LBB9_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; VI-NEXT: s_cbranch_vccz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else20 ; VI-NEXT: s_and_b32 s2, s0, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB9_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute19 ; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; VI-NEXT: v_ldexp_f32 v1, v3, 1 @@ -1273,10 +1273,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: .LBB9_5: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i 
; VI-NEXT: s_branch .LBB9_7 ; VI-NEXT: .LBB9_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1315,7 +1315,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; VI-NEXT: s_cbranch_vccz .LBB9_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_and_b32 s3, s4, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s3, s3, 1 ; VI-NEXT: s_cmp_lg_u32 s3, 0 ; VI-NEXT: s_cbranch_scc1 .LBB9_16 -; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; VI-NEXT: v_ldexp_f32 v2, v4, 1 @@ -1351,10 +1351,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: .LBB9_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1370,7 +1370,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_15 ; VI-NEXT: .LBB9_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, 
v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; CI-NEXT: s_cbranch_vccz .LBB10_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else86 ; CI-NEXT: s_and_b32 s0, s4, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1436,7 +1436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s0, s0, 1 ; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute85 ; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 @@ -1461,10 +1461,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: .LBB10_5: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_7 ; CI-NEXT: .LBB10_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1503,7 +1503,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; 
CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else53 ; CI-NEXT: s_and_b32 s1, s6, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1514,7 +1514,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_16 -; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: ; %bb.11: ; %frem.compute52 ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 @@ -1539,10 +1539,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: .LBB10_13: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_15 ; CI-NEXT: .LBB10_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1579,7 +1579,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: ; %bb.17: ; %frem.else20 ; CI-NEXT: s_and_b32 s1, s5, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 ; CI-NEXT: 
v_mov_b32_e32 v2, s1 @@ -1590,7 +1590,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_24 -; CI-NEXT: ; %bb.19: ; %frem.compute52 +; CI-NEXT: ; %bb.19: ; %frem.compute19 ; CI-NEXT: v_frexp_mant_f32_e32 v5, v3 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 ; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1 @@ -1615,10 +1615,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 ; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_22 -; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7 ; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: .LBB10_21: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v6 ; CI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -1634,7 +1634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_23 ; CI-NEXT: .LBB10_22: ; CI-NEXT: v_mov_b32_e32 v7, v6 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 ; CI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr3 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 ; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_and_b32 s1, s7, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: 
s_cbranch_scc1 .LBB10_32 -; CI-NEXT: ; %bb.27: ; %frem.compute85 +; CI-NEXT: ; %bb.27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v6, v4 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 ; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1 @@ -1693,10 +1693,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 ; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_30 -; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8 ; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: .LBB10_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v8, v7 ; CI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -1712,7 +1712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_31 ; CI-NEXT: .LBB10_30: ; CI-NEXT: v_mov_b32_e32 v8, v7 -; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5 ; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 ; CI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -1791,7 +1791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; VI-NEXT: s_cbranch_vccz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else86 ; VI-NEXT: s_and_b32 s0, s8, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1802,7 +1802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute85 ; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; VI-NEXT: v_ldexp_f32 v1, v3, 1 
@@ -1827,10 +1827,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: .LBB10_5: ; %frem.loop_body93 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1846,7 +1846,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_7 ; VI-NEXT: .LBB10_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1869,7 +1869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; VI-NEXT: s_cbranch_vccz .LBB10_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else53 ; VI-NEXT: s_and_b32 s0, s4, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; VI-NEXT: v_mov_b32_e32 v1, s0 @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_16 -; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: ; %bb.11: ; %frem.compute52 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; VI-NEXT: v_ldexp_f32 v2, v4, 1 @@ -1905,10 +1905,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_14 -; 
VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: .LBB10_13: ; %frem.loop_body60 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_15 ; VI-NEXT: .LBB10_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; VI-NEXT: s_cbranch_vccz .LBB10_18 -; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: ; %bb.17: ; %frem.else20 ; VI-NEXT: s_and_b32 s0, s9, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_24 -; VI-NEXT: ; %bb.19: ; %frem.compute52 +; VI-NEXT: ; %bb.19: ; %frem.compute19 ; VI-NEXT: v_frexp_mant_f32_e32 v5, v3 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 ; VI-NEXT: v_ldexp_f32 v3, v5, 1 @@ -1981,10 +1981,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 ; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_22 -; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 -; VI-NEXT: .LBB10_21: ; 
%frem.loop_body60 +; VI-NEXT: .LBB10_21: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -2000,7 +2000,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_23 ; VI-NEXT: .LBB10_22: ; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 ; VI-NEXT: v_ldexp_f32 v4, v7, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 ; VI-NEXT: s_cbranch_vccz .LBB10_26 -; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_and_b32 s0, s12, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 ; VI-NEXT: v_mov_b32_e32 v3, s0 @@ -2034,7 +2034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_32 -; VI-NEXT: ; %bb.27: ; %frem.compute85 +; VI-NEXT: ; %bb.27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e32 v6, v4 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 ; VI-NEXT: v_ldexp_f32 v4, v6, 1 @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 ; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_30 -; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8 ; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: .LBB10_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v8, v7 ; VI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -2078,7 +2078,7 @@ define amdgpu_kernel 
void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_31 ; VI-NEXT: .LBB10_30: ; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5 ; VI-NEXT: v_ldexp_f32 v5, v8, v5 ; VI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -2144,7 +2144,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| ; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_cbranch_vccz .LBB11_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: s_and_b32 s6, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2156,7 +2156,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s6, s6, 1 ; CI-NEXT: s_cmp_lg_u32 s6, 0 ; CI-NEXT: s_cbranch_scc1 .LBB11_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 ; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 @@ -2181,10 +2181,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: .LBB11_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2200,7 +2200,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_7 ; CI-NEXT: .LBB11_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: .LBB11_7: ; 
%frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB11_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_and_b32 s6, s3, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2231,7 +2231,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s6, s6, 1 ; CI-NEXT: s_cmp_lg_u32 s6, 0 ; CI-NEXT: s_cbranch_scc1 .LBB11_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5| ; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 ; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 @@ -2256,10 +2256,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: .LBB11_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_15 ; CI-NEXT: .LBB11_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_cbranch_vccz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: s_and_b32 s6, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2329,7 +2329,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s6, s6, 1 ; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc1 .LBB11_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; VI-NEXT: v_ldexp_f32 v1, v1, 1 ; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: .LBB11_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_7 ; VI-NEXT: .LBB11_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2392,7 +2392,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s6, 1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB11_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_and_b32 s6, s3, 0x80000000 ; 
VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2404,7 +2404,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s6, s6, 1 ; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc1 .LBB11_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5| ; VI-NEXT: v_ldexp_f32 v2, v2, 1 ; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 @@ -2429,10 +2429,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: .LBB11_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2448,7 +2448,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_15 ; VI-NEXT: .LBB11_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2498,7 +2498,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| ; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_cbranch_vccz .LBB12_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else78 ; CI-NEXT: s_and_b32 s2, s4, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v0, s4 @@ -2510,7 +2510,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: 
s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute77 ; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8| ; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 ; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 @@ -2535,10 +2535,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: .LBB12_5: ; %frem.loop_body85 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2554,7 +2554,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_7 ; CI-NEXT: .LBB12_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB12_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else47 ; CI-NEXT: s_and_b32 s2, s5, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v2, s9 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute46 ; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 ; CI-NEXT: 
v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 @@ -2610,10 +2610,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: .LBB12_13: ; %frem.loop_body54 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_15 ; CI-NEXT: .LBB12_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2648,7 +2648,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: s_cbranch_vccz .LBB12_18 -; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: ; %bb.17: ; %frem.else16 ; CI-NEXT: s_and_b32 s2, s6, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v3, s10 ; CI-NEXT: v_mov_b32_e32 v2, s6 @@ -2660,7 +2660,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_24 -; CI-NEXT: ; %bb.19: ; %frem.compute46 +; CI-NEXT: ; %bb.19: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10| ; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 ; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 @@ -2685,10 +2685,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 ; CI-NEXT: 
v_div_fixup_f32 v5, v5, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_22 -; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7 ; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: .LBB12_21: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v6 ; CI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -2704,7 +2704,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_23 ; CI-NEXT: .LBB12_22: ; CI-NEXT: v_mov_b32_e32 v7, v6 -; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 ; CI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -2723,7 +2723,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr3 ; CI-NEXT: s_cbranch_vccz .LBB12_26 -; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_and_b32 s2, s7, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v4, s11 ; CI-NEXT: v_mov_b32_e32 v3, s7 @@ -2735,7 +2735,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_32 -; CI-NEXT: ; %bb.27: ; %frem.compute77 +; CI-NEXT: ; %bb.27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11| ; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1 ; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 ; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_30 -; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8 ; 
CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: .LBB12_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v8, v7 ; CI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -2779,7 +2779,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_31 ; CI-NEXT: .LBB12_30: ; CI-NEXT: v_mov_b32_e32 v8, v7 -; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 ; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 ; CI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -2829,7 +2829,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_cbranch_vccz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else78 ; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2841,7 +2841,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute77 ; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8| ; VI-NEXT: v_ldexp_f32 v1, v1, 1 ; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 @@ -2866,10 +2866,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: .LBB12_5: ; %frem.loop_body85 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: 
v_mul_f32_e32 v4, v5, v3 @@ -2885,7 +2885,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_7 ; VI-NEXT: .LBB12_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2904,7 +2904,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB12_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else47 ; VI-NEXT: s_and_b32 s2, s5, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v2, s9 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2916,7 +2916,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute46 ; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; VI-NEXT: v_ldexp_f32 v2, v2, 1 ; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 @@ -2941,10 +2941,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: .LBB12_13: ; %frem.loop_body54 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2960,7 +2960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_15 ; VI-NEXT: .LBB12_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: 
.LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_cbranch_vccz .LBB12_18 -; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: ; %bb.17: ; %frem.else16 ; VI-NEXT: s_and_b32 s2, s6, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v3, s10 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -2991,7 +2991,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_24 -; VI-NEXT: ; %bb.19: ; %frem.compute46 +; VI-NEXT: ; %bb.19: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10| ; VI-NEXT: v_ldexp_f32 v3, v3, 1 ; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 @@ -3016,10 +3016,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 ; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_22 -; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 -; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: .LBB12_21: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -3035,7 +3035,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_23 ; VI-NEXT: .LBB12_22: ; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 ; VI-NEXT: v_ldexp_f32 v4, v7, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -3054,7 +3054,7 @@ define 
amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_cbranch_vccz .LBB12_26 -; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_and_b32 s2, s7, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v4, s11 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -3066,7 +3066,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_32 -; VI-NEXT: ; %bb.27: ; %frem.compute77 +; VI-NEXT: ; %bb.27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11| ; VI-NEXT: v_ldexp_f32 v4, v4, 1 ; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 @@ -3091,10 +3091,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 ; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_30 -; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8 ; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: .LBB12_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v8, v7 ; VI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_31 ; VI-NEXT: .LBB12_30: ; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 ; VI-NEXT: v_ldexp_f32 v5, v8, v5 ; VI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| ; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB13_2 -; CI-NEXT: ; %bb.1: ; 
%frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| @@ -3187,7 +3187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB13_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| @@ -3210,10 +3210,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 ; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 -; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: .LBB13_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v5 ; CI-NEXT: v_mov_b32_e32 v6, v4 @@ -3232,7 +3232,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: .LBB13_6: ; CI-NEXT: v_mov_b32_e32 v7, v5 ; CI-NEXT: v_mov_b32_e32 v6, v4 -; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 ; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 ; CI-NEXT: s_mov_b32 s2, 0 @@ -3256,7 +3256,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CI-NEXT: s_cbranch_vccz .LBB13_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| @@ -3274,7 +3274,7 @@ 
define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB13_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| @@ -3297,10 +3297,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8 ; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9 -; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: .LBB13_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mov_b32_e32 v8, v6 @@ -3319,7 +3319,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: .LBB13_14: ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mov_b32_e32 v8, v6 -; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11 ; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 ; CI-NEXT: s_mov_b32 s2, 0 @@ -3371,7 +3371,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| @@ -3389,7 +3389,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; 
VI-NEXT: s_cbranch_scc1 .LBB13_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| @@ -3412,10 +3412,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 ; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 -; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: .LBB13_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 @@ -3434,7 +3434,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: .LBB13_6: ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 ; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 ; VI-NEXT: s_mov_b32 s2, 0 @@ -3458,7 +3458,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; VI-NEXT: s_cbranch_vccz .LBB13_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| @@ -3476,7 +3476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB13_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| ; VI-NEXT: 
v_frexp_exp_i32_f64_e64 v8, |s[6:7]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| @@ -3499,10 +3499,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 ; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8 ; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9 -; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: .LBB13_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mov_b32_e32 v8, v6 @@ -3521,7 +3521,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: .LBB13_14: ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11 ; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 ; VI-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index df9c97f..117af95 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6551,271 +6551,205 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, 
v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off 
offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: @@ -15709,61 +15643,61 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 
offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -15778,121 +15712,123 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15903,215 +15839,179 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 ; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 
0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; 
GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -16133,433 +16033,329 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 
0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 
0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -42692,271 +42488,205 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; 
GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, 
v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, 
v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 
v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: @@ -53003,61 +52733,61 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, 
off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 
offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -53072,121 +52802,123 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -53197,215 +52929,179 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4 ; GFX11-TRUE16-NEXT: .LBB38_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h 
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 
0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -53427,433 +53123,329 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, 
v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, 
v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 
0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -78968,271 +78560,205 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: 
v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 
v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, 
v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, 
v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: @@ -88136,61 +87662,61 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -88205,121 +87731,123 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 
offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, 
v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -88330,215 +87858,179 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 ; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 
v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, 
v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -88560,433 +88052,329 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 
3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, 
v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, 
v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; 
@@ -113114,271 +112502,205 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, 
v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; 
GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: @@ -123405,61 +122727,61 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 
v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -123474,121 +122796,123 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 
offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l 
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, 
v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -123599,215 +122923,179 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4 ; GFX11-TRUE16-NEXT: .LBB74_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, 
v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 
v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -123829,433 +123117,329 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 
3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 
3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l 
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, 
v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, 
v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, 
v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -161654,179 +160838,182 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232 -; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x18 -; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, 
s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 
offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v98, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_hi16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr61_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr155_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 @@ -161835,136 +161022,136 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v4 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v81 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v98 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v24 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v88, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v80 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[98:99] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[80:81], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v10.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v142.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v143.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v141.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v136.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v139.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v155.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v24.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v98.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v99.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v80.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v81.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v55.h, v81.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 @@ -161980,7 +161167,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 @@ -162019,10 +161206,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v37, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v37, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 @@ -162036,97 +161223,101 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v135.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v33, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v147.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v17, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v17, v33 :: v_dual_and_b32 v18, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v18, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v33, v20, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v151.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v34 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v149, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, 
v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v160.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v161.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v151.h ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v23 :: v_dual_lshlrev_b32 v21, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162139,8 +161330,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x 
bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v161, v19, v23 :: v_dual_lshlrev_b32 v22, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v163.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162153,10 +161346,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v161.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v150.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162169,10 +161361,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v35 -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v163, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v162.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v165.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162185,10 +161377,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v163.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v38 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162201,14 +161393,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v51 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v81 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v167.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v81 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v53, v17, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 @@ -162217,14 +161409,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v98 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v160.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v165.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v80 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v164.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v80 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_cndmask_b32 v52, v19, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 @@ -162233,10 +161425,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v88, 8, v37 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v177.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v49 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 @@ -162249,10 +161441,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v167.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 @@ -162263,13 +161454,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v55 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v179.h ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -162282,11 +161472,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v162.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v177.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -162301,9 +161490,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v65 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v65 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v2, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v166.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v181.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v2, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3 @@ -162313,13 +161502,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v179.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v51 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v164.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v1, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v176.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v51 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v1, v18, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6 @@ -162330,13 +161519,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v183.h ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[48:49] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[37:38] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v2, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v50 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v40, v2, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 @@ -162350,28 +161539,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v56, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v2, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v48 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v56.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v41.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v83, v1, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v71, v1, v8 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -162380,29 +161568,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v60.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v166.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v46.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v79.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v178.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v74, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 
v85, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -162410,40 +161598,40 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v76.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v176.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[82:83] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v182.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v74.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v72.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[70:71] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[54:55] ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v106, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v106.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v104, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v104.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 
v106, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[96:97] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v180.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v3, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v104.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v106.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[84:85] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 @@ -162452,8 +161640,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v178.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v127, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v136, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -162461,19 +161649,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> 
%a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v125, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v139, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v40.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v139.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 @@ -162481,11 +161669,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v125.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v127.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v102.l, v136.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v40.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v142, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 @@ -162494,389 +161682,322 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v142.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v141, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v2, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v42.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[112:113] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v143, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[130:131] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v155, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[68:69] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[68:69] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v141.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[33:34] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v129 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v129 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v143.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v134 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v134 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v128 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v113 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[133:134] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[128:129] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[35:36] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v133 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v113 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v112 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v97 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v97 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v96 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v154.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v155.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v148 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v148 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v130 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[147:148] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v147 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v102 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v85 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v91, 8, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v84 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v68 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v65 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v54 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v52 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v181.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v139.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v180.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v143.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v141.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v183.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v140.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v144.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v177.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v140.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v40.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v138.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v136.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v182.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v127.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v125.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v41.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v123.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v137.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v4.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v56.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v126.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v107.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v42.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v123.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v79.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v111.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v91.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v60.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v109.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v75.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v5, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 
0xff, v106.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v95.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v5, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v61.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v111.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v72.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v109.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v46.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v107.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v92.l +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v89.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v136.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v79.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v106.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v77.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v84.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v74.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v91.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v153.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v62.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v139.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v59.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v155.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v154.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v47.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v44.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v142.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, 
v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v138.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v137.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v126.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v76.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v93.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v5, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v127.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v89.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v45.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v104.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v78.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v120.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v5, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v13.l, v13.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v142.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v5, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v105.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v125.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v63.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v5, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v122.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v120.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v110.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v108.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v95.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v93.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v90.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v143.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v58.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v90.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v5, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v141.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v47.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v74.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v5, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v124.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v5, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v59.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v122.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v5, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v110.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v44.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v5, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v78.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v167.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v166.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v73.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v63.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v177.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v52.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v176.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v108.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v183.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v5, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v94.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v34.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v5, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v180.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v92.l -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v5, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v88.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v5, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v24.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v77.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v5, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v26.l, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v72.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v5, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v26.l, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v62.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v5, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v57.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v5, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v29.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v46.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v5, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.l, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v30.l, v30.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v166.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v43.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v5, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v41.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v5, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v176.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v182.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v33.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v167.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v181.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v5, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v33.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v34.l, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v5, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v56.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v45.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 
offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x18 -; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:156 -; GFX11-TRUE16-NEXT: 
scratch_load_b32 v75, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:52 +; GFX11-TRUE16-NEXT: 
scratch_load_b32 v136, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 
offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:248 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -186713,69 +185834,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -186784,95 +185905,91 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 
24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 
op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -186883,345 +186000,283 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: 
v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, 
v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: @@ -209415,69 +208470,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -209486,95 +208541,91 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -209585,345 +208636,283 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] 
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 
v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v14.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: 
scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 64b5ecc..582f31b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -4125,19 +4125,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -4152,94 +4152,71 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8614,19 +8591,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -8641,94 +8618,71 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, 
v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 
v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12703,19 +12657,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -12730,94 +12684,71 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 
v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16408,19 +16339,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -16435,94 +16366,71 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, 
v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, 
v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19833,19 +19741,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -19860,94 +19768,71 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22745,19 +22630,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -22772,94 +22657,71 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24960,19 +24822,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -24987,94 +24849,71 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 
v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index cb4b3bd..0a73571 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6298,31 +6298,33 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -6335,48 +6337,43 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -6387,122 +6384,88 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l 
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13349,31 +13312,33 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 
v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -13386,48 +13351,43 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -13438,122 +13398,88 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 
v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19888,31 +19814,33 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -19925,48 +19853,43 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -19977,122 +19900,88 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, 
v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25929,31 +25818,33 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -25966,48 +25857,43 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 
v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -26018,122 +25904,88 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 3aaf254..b622e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -3044,91 +3044,66 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: @@ -5025,39 +5000,41 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -5071,63 +5048,53 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -5140,147 +5107,110 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 
v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 
v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9991,91 +9921,66 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: 
scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: @@ -11997,39 +11902,41 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -12043,63 +11950,53 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -12112,147 +12009,110 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 
v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16367,91 +16227,66 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, 
v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, 
v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: @@ -22484,91 +22319,66 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 
v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8: @@ -28791,39 +28601,38 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -28837,65 +28646,55 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 
v1.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr24_lo16 @@ -28906,146 +28705,110 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 
v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 
v7.h, v16.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -30878,91 +30641,66 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h 
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; 
GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8: @@ -32912,39 +32650,38 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 @@ -32958,65 +32695,55 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h 
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 
v6.h, v6.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 @@ -33027,146 +32754,110 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v3.h, v23.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 
v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 
v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -35022,91 +34713,66 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, 
v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 632b03c..e6c7b1a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2279,17 +2279,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2301,13 +2297,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,17 +4522,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4552,13 +4540,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6487,17 +6471,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6509,13 +6489,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8138,17 +8114,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8160,13 +8132,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9502,17 +9470,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9524,13 +9488,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10212,17 +10172,13 @@ define <1 x i32> 
@bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10234,13 +10190,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index d3fbba3..bff054f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -8921,133 +8921,98 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, 
v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -12574,53 +12539,52 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -12633,98 +12597,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, 
v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -12745,226 +12693,170 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23576,133 +23468,98 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, 
v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 
v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -27358,53 +27215,52 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -27417,98 +27273,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: 
v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -27529,226 +27369,170 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, 
v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -37760,133 +37544,98 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -41418,53 +41167,52 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; 
GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -41477,98 +41225,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -41589,226 +41321,170 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, 
v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -50954,133 +50630,98 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: 
v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 
v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -54638,53 +54279,52 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, 
v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -54697,98 +54337,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, 
v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -54809,226 +54433,170 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 
s[30:31] ; @@ -64107,133 +63675,98 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ 
-76401,133 +75934,98 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 
v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, 
v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -85053,57 +84551,57 @@ define <64 x i8> 
@bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: 
s_and_saveexec_b32 s0, vcc_lo @@ -85111,29 +84609,29 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v7 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v112, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v3 @@ -85141,11 +84639,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v3.h @@ -85155,26 +84653,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v5.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v11.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v27.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 @@ -85187,71 +84685,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v20, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v18, v23 :: v_dual_lshlrev_b32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v18, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v18, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v23, v26, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v20 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v53.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v1, v20, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v1, v19 :: v_dual_and_b32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -85304,305 +84803,266 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v65.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v66, v4, v5 :: v_dual_lshlrev_b32 v5, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v5, 16, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v21 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v1, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, 
v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v3, v8 :: v_dual_and_b32 v3, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v3, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v1, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v1, v4, vcc_lo ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v67.h ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v23 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v68.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v80, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v82.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v26 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v80.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v26 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v3, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v80.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v71.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v2, v3 :: v_dual_add_f32 v2, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v97.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v6, v7 :: v_dual_and_b32 v5, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 
0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v96.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v33 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v112, v4, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v112.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v98.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v113, v4, v6 :: v_dual_add_f32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v113.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[23:24] ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v103.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v2, v3, 
vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v113.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v115.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v117.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[38:39] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[32:33] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v38 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v131.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v50.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v1.l, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v128.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v10.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v84.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v14.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index ecc715c..11f90b9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -3067,9 +3067,9 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3085,52 +3085,47 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6210,9 +6205,9 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6228,52 +6223,47 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9050,9 +9040,9 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9068,52 +9058,47 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 
0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11590,9 +11575,9 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -11608,52 +11593,47 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, 
v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13809,9 +13789,9 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -13827,52 +13807,47 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: 
v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15630,9 +15605,9 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 
8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15648,52 +15623,47 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16934,9 +16904,9 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16952,52 +16922,47 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 685e2fb..9a6ea1b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1104,16 +1104,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> 
%a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -1128,37 +1127,28 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v1.h, v1.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true @@ -1166,36 +1156,26 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4254,16 +4234,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -4278,37 +4257,28 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -4316,36 +4286,26 @@ define <3 x float> 
@bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6909,12 +6869,12 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -6929,37 +6889,28 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true @@ -6967,36 +6918,26 @@ define <6 x bfloat> 
@bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, 
v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8669,12 +8610,12 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -8689,37 +8630,28 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true @@ -8727,36 +8659,26 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 
x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10079,12 +10001,12 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -10099,37 +10021,28 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 
v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true @@ -10137,36 +10050,26 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, 
v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index cbf6b66..7dbbeaa 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -3632,13 +3632,9 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -3813,16 +3809,12 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec ; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) ; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v0.l, v0.h, 0xff bitop3:0xec +; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v1, off ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 26f204f..14897b6 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1771,33 +1771,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1] +; GFX11-TRUE16-NEXT: global_load_b32 v5, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l 
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v5.h, 9 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v5.h +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x900, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x900, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 78a961e..415828f 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -4858,7 +4858,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; 
SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_cbranch_vccz .LBB9_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else20 ; SI-NEXT: v_bfi_b32 v7, s0, 0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 @@ -4869,7 +4869,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB9_2: ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB9_3: ; %frem.compute +; SI-NEXT: .LBB9_3: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 @@ -4905,10 +4905,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB9_5: ; %frem.loop_body +; SI-NEXT: .LBB9_5: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -4923,7 +4923,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB9_5 ; SI-NEXT: ; %bb.6: ; %Flow55 ; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: .LBB9_7: ; %frem.loop_exit +; SI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1 ; SI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -4944,7 +4944,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 ; SI-NEXT: s_cbranch_vccz .LBB9_10 -; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v8, s0, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -4956,7 +4956,7 @@ 
define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB9_10: ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB9_11: ; %frem.compute19 +; SI-NEXT: .LBB9_11: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v6 @@ -4992,10 +4992,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB9_13: ; %frem.loop_body27 +; SI-NEXT: .LBB9_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v8, v6 ; SI-NEXT: v_mul_f32_e32 v6, v8, v7 @@ -5010,7 +5010,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB9_13 ; SI-NEXT: ; %bb.14: ; %Flow ; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; SI-NEXT: .LBB9_15: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1 ; SI-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -5084,7 +5084,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB9_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else20 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_bfi_b32 v7, s0, 0, v2 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -5093,7 +5093,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_8 ; CI-NEXT: .LBB9_2: ; CI-NEXT: ; implicit-def: $vgpr4 -; CI-NEXT: .LBB9_3: ; %frem.compute +; CI-NEXT: .LBB9_3: ; %frem.compute19 ; CI-NEXT: 
v_frexp_exp_i32_f32_e32 v9, v6 ; CI-NEXT: v_frexp_mant_f32_e32 v4, v6 ; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -5118,10 +5118,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 ; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6 -; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -5136,7 +5136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB9_5 ; CI-NEXT: ; %bb.6: ; %Flow55 ; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6 ; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 ; CI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -5157,7 +5157,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; CI-NEXT: s_cbranch_vccz .LBB9_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v8, s0, 0, v0 @@ -5167,7 +5167,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_16 ; CI-NEXT: .LBB9_10: ; CI-NEXT: ; implicit-def: $vgpr5 -; CI-NEXT: .LBB9_11: ; %frem.compute19 +; CI-NEXT: .LBB9_11: ; %frem.compute ; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 ; CI-NEXT: v_frexp_mant_f32_e32 v5, v7 ; CI-NEXT: v_frexp_mant_f32_e32 v7, v6 @@ -5192,10 +5192,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: 
v_cmp_gt_i32_e32 vcc, 12, v7 ; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 ; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7 -; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: .LBB9_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v10, v8 ; CI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -5210,7 +5210,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB9_13 ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v8, v10 -; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7 ; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -5275,7 +5275,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v3, |v1| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; VI-NEXT: s_cbranch_vccz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else20 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v2, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 @@ -5284,7 +5284,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_8 ; VI-NEXT: .LBB9_2: ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: .LBB9_3: ; %frem.compute +; VI-NEXT: .LBB9_3: ; %frem.compute19 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; VI-NEXT: v_frexp_mant_f32_e32 v2, v4 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v3 @@ -5309,10 +5309,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 ; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; 
VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8 ; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4 -; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: .LBB9_5: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -5327,7 +5327,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB9_5 ; VI-NEXT: ; %bb.6: ; %Flow55 ; VI-NEXT: v_mov_b32_e32 v5, v7 -; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 ; VI-NEXT: v_ldexp_f32 v4, v5, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v6 @@ -5347,7 +5347,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v6, |v4| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; VI-NEXT: s_cbranch_vccz .LBB9_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v5, s2, 0, v3 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 @@ -5356,7 +5356,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_16 ; VI-NEXT: .LBB9_10: ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: .LBB9_11: ; %frem.compute19 +; VI-NEXT: .LBB9_11: ; %frem.compute ; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 ; VI-NEXT: v_frexp_mant_f32_e32 v5, v7 ; VI-NEXT: v_frexp_mant_f32_e32 v7, v6 @@ -5381,10 +5381,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 ; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v7, vcc, 11, v7 -; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: .LBB9_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: 
v_mov_b32_e32 v10, v8 ; VI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -5399,7 +5399,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB9_13 ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v8, v10 -; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v7, vcc, -10, v7 ; VI-NEXT: v_ldexp_f32 v7, v8, v7 ; VI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -5443,7 +5443,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; GFX9-NEXT: s_cbranch_vccz .LBB9_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else20 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 @@ -5452,7 +5452,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB9_8 ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: .LBB9_3: ; %frem.compute +; GFX9-NEXT: .LBB9_3: ; %frem.compute19 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v3 @@ -5477,10 +5477,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v4, 11, v4 -; GFX9-NEXT: .LBB9_5: ; %frem.loop_body +; GFX9-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -5495,7 +5495,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: 
s_cbranch_vccnz .LBB9_5 ; GFX9-NEXT: ; %bb.6: ; %Flow55 ; GFX9-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX9-NEXT: v_add_u32_e32 v4, -10, v4 ; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; GFX9-NEXT: s_cbranch_vccz .LBB9_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -5523,7 +5523,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB9_16 ; GFX9-NEXT: .LBB9_10: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: .LBB9_11: ; %frem.compute19 +; GFX9-NEXT: .LBB9_11: ; %frem.compute ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -5548,10 +5548,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 -; GFX9-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX9-NEXT: .LBB9_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -5566,7 +5566,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB9_13 ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: 
.LBB9_15: ; %frem.loop_exit28 +; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -5612,7 +5612,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: s_cbranch_vccz .LBB9_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else20 ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo @@ -5620,7 +5620,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB9_8 ; GFX10-NEXT: .LBB9_2: ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: .LBB9_3: ; %frem.compute +; GFX10-NEXT: .LBB9_3: ; %frem.compute19 ; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -5647,10 +5647,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB9_5: ; %frem.loop_body +; GFX10-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v7, v4 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -5666,7 +5666,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow55 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX10-NEXT: 
v_ldexp_f32 v4, v4, v6 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -5684,7 +5684,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v3| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v4 ; GFX10-NEXT: s_cbranch_vccz .LBB9_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc_lo @@ -5692,7 +5692,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB9_16 ; GFX10-NEXT: .LBB9_10: ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: .LBB9_11: ; %frem.compute19 +; GFX10-NEXT: .LBB9_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v6 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 ; GFX10-NEXT: v_ldexp_f32 v6, v5, 11 @@ -5719,10 +5719,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX10-NEXT: .LBB9_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -5782,7 +5782,7 @@ define amdgpu_kernel void 
@frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 @@ -5793,7 +5793,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB9_8 ; GFX11-TRUE16-NEXT: .LBB9_2: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -5829,11 +5829,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -5853,7 +5853,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; 
GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -5880,7 +5880,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10 -; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 @@ -5891,7 +5891,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB9_16 ; GFX11-TRUE16-NEXT: .LBB9_10: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v6 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5927,11 +5927,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v7 @@ -5951,7 +5951,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, v9 @@ -6002,7 +6002,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6011,7 +6011,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB9_8 ; GFX11-FAKE16-NEXT: .LBB9_2: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -6047,11 +6047,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -6071,7 +6071,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -6097,7 +6097,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_10 -; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6106,7 +6106,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB9_16 ; GFX11-FAKE16-NEXT: .LBB9_10: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 -; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v7 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) @@ -6142,11 +6142,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v7 @@ -6166,7 +6166,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, v9 @@ -6220,7 +6220,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1150-TRUE16-NEXT: .LBB9_2: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: 
.LBB9_3: ; %frem.compute19 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6267,11 +6267,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s5, s6, s5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, 11 -; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6293,7 +6293,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6323,7 +6323,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; 
GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 @@ -6335,7 +6335,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1150-TRUE16-NEXT: .LBB9_10: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6370,11 +6370,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s7, s8, s7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, 11 -; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6396,7 +6396,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6459,7 +6459,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6469,7 +6469,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB9_8 ; GFX1150-FAKE16-NEXT: .LBB9_2: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6504,11 +6504,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s5, s6, s5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, 11 -; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6530,7 +6530,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; 
GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6559,7 +6559,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6569,7 +6569,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB9_16 ; GFX1150-FAKE16-NEXT: .LBB9_10: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6604,11 +6604,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s7, s8, s7 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, 11 -; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6630,7 +6630,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6690,7 +6690,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 @@ -6702,7 +6702,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1200-TRUE16-NEXT: .LBB9_2: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6737,11 +6737,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s5, s6, s5 ; 
GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, 11 -; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6765,7 +6765,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6799,7 +6799,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 @@ -6811,7 +6811,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1200-TRUE16-NEXT: .LBB9_10: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6847,11 +6847,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; 
GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s7, s8, s7 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, 11 -; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6875,7 +6875,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6940,7 +6940,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6950,7 +6950,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB9_8 ; GFX1200-FAKE16-NEXT: .LBB9_2: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1200-FAKE16-NEXT: 
v_frexp_mant_f32_e32 v1, s5 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6986,11 +6986,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s5, s6, s5 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, 11 -; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -7014,7 +7014,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -7047,7 +7047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB9_16 ; GFX1200-FAKE16-NEXT: .LBB9_10: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -7094,11 +7094,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s7, s8, s7 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, 11 -; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -7122,7 +7122,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -7208,7 +7208,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 ; SI-NEXT: s_cbranch_vccz .LBB10_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; 
%bb.1: ; %frem.else86 ; SI-NEXT: v_bfi_b32 v11, s0, 0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 @@ -7219,7 +7219,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_2: ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_3: ; %frem.compute +; SI-NEXT: .LBB10_3: ; %frem.compute85 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 @@ -7255,10 +7255,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_5: ; %frem.loop_body +; SI-NEXT: .LBB10_5: ; %frem.loop_body93 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v11, v9 ; SI-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -7273,7 +7273,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_5 ; SI-NEXT: ; %bb.6: ; %Flow133 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB10_7: ; %frem.loop_exit +; SI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1 ; SI-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -7294,7 +7294,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11 ; SI-NEXT: s_cbranch_vccz .LBB10_10 -; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: ; %bb.9: ; %frem.else53 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v12, s0, 0, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -7306,7 +7306,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
SI-NEXT: .LBB10_10: ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_11: ; %frem.compute19 +; SI-NEXT: .LBB10_11: ; %frem.compute52 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 @@ -7342,10 +7342,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_13: ; %frem.loop_body27 +; SI-NEXT: .LBB10_13: ; %frem.loop_body60 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v12, v10 ; SI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -7360,7 +7360,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_13 ; SI-NEXT: ; %bb.14: ; %Flow129 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; SI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1 ; SI-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -7381,7 +7381,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12 ; SI-NEXT: s_cbranch_vccz .LBB10_18 -; SI-NEXT: ; %bb.17: ; %frem.else53 +; SI-NEXT: ; %bb.17: ; %frem.else20 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v13, s0, 0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 @@ -7393,7 +7393,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_18: ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_19: ; %frem.compute52 +; SI-NEXT: .LBB10_19: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: 
v_cmp_lt_f32_e64 vcc, |v11|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11 @@ -7429,10 +7429,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_21: ; %frem.loop_body60 +; SI-NEXT: .LBB10_21: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v13, v11 ; SI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -7447,7 +7447,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_21 ; SI-NEXT: ; %bb.22: ; %Flow125 ; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; SI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1 ; SI-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -7468,7 +7468,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13 ; SI-NEXT: s_cbranch_vccz .LBB10_26 -; SI-NEXT: ; %bb.25: ; %frem.else86 +; SI-NEXT: ; %bb.25: ; %frem.else ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v14, s0, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -7480,7 +7480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_26: ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_27: ; %frem.compute85 +; SI-NEXT: .LBB10_27: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 @@ -7516,10 +7516,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v13, 
v13, v11, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_29: ; %frem.loop_body93 +; SI-NEXT: .LBB10_29: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v14, v12 ; SI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -7534,7 +7534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_29 ; SI-NEXT: ; %bb.30: ; %Flow ; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; SI-NEXT: .LBB10_31: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1 ; SI-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -7638,7 +7638,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7 ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB10_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else86 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; CI-NEXT: v_bfi_b32 v11, s0, 0, v6 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 @@ -7647,7 +7647,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_8 ; CI-NEXT: .LBB10_2: ; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB10_3: ; %frem.compute +; CI-NEXT: .LBB10_3: ; %frem.compute85 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 ; CI-NEXT: v_frexp_mant_f32_e32 v8, v10 ; CI-NEXT: v_frexp_mant_f32_e32 v10, v9 @@ -7672,10 +7672,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 ; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; CI-NEXT: v_sub_i32_e32 v10, 
vcc, v13, v14 ; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10 -; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: .LBB10_5: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v13, v11 ; CI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -7690,7 +7690,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_5 ; CI-NEXT: ; %bb.6: ; %Flow133 ; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10 ; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 ; CI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -7711,7 +7711,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 ; CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else53 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v12, s0, 0, v4 @@ -7721,7 +7721,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_16 ; CI-NEXT: .LBB10_10: ; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB10_11: ; %frem.compute19 +; CI-NEXT: .LBB10_11: ; %frem.compute52 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 ; CI-NEXT: v_frexp_mant_f32_e32 v9, v11 ; CI-NEXT: v_frexp_mant_f32_e32 v11, v10 @@ -7746,10 +7746,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 ; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11 -; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: .LBB10_13: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v14, v12 ; CI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -7764,7 +7764,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_13 ; CI-NEXT: ; %bb.14: ; %Flow129 ; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11 ; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 ; CI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -7785,7 +7785,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11 ; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: ; %bb.17: ; %frem.else20 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v13, s0, 0, v2 @@ -7795,7 +7795,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_24 ; CI-NEXT: .LBB10_18: ; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB10_19: ; %frem.compute52 +; CI-NEXT: .LBB10_19: ; %frem.compute19 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12 ; CI-NEXT: v_frexp_mant_f32_e32 v10, v12 ; CI-NEXT: v_frexp_mant_f32_e32 v12, v11 @@ -7820,10 +7820,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12 ; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 ; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12 -; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: .LBB10_21: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v13 ; CI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -7838,7 +7838,7 @@ define amdgpu_kernel void @frem_v4f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_21 ; CI-NEXT: ; %bb.22: ; %Flow125 ; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12 ; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 ; CI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -7859,7 +7859,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 ; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v14, s0, 0, v0 @@ -7869,7 +7869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_32 ; CI-NEXT: .LBB10_26: ; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: .LBB10_27: ; %frem.compute85 +; CI-NEXT: .LBB10_27: ; %frem.compute ; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 ; CI-NEXT: v_frexp_mant_f32_e32 v11, v13 ; CI-NEXT: v_frexp_mant_f32_e32 v13, v12 @@ -7894,10 +7894,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 ; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13 -; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: .LBB10_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v16, v14 ; CI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -7912,7 +7912,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_29 ; CI-NEXT: ; %bb.30: ; %Flow ; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB10_31: ; 
%frem.loop_exit94 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13 ; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 ; CI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -8001,7 +8001,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v5, |v2| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; VI-NEXT: s_cbranch_vccz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else86 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -8010,7 +8010,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_8 ; VI-NEXT: .LBB10_2: ; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: .LBB10_3: ; %frem.compute +; VI-NEXT: .LBB10_3: ; %frem.compute85 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v6 ; VI-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -8035,10 +8035,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 ; VI-NEXT: v_add_u32_e32 v6, vcc, 11, v6 -; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: .LBB10_5: ; %frem.loop_body93 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -8053,7 +8053,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_5 ; VI-NEXT: ; %bb.6: ; %Flow133 ; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; VI-NEXT: v_add_u32_e32 v6, vcc, -10, v6 ; VI-NEXT: v_ldexp_f32 v6, v7, v6 ; VI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -8073,7 +8073,7 @@ define 
amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v8, |v6| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 ; VI-NEXT: s_cbranch_vccz .LBB10_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else53 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v7, s2, 0, v5 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 @@ -8082,7 +8082,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_16 ; VI-NEXT: .LBB10_10: ; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: .LBB10_11: ; %frem.compute19 +; VI-NEXT: .LBB10_11: ; %frem.compute52 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 ; VI-NEXT: v_frexp_mant_f32_e32 v7, v9 ; VI-NEXT: v_frexp_mant_f32_e32 v9, v8 @@ -8107,10 +8107,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 ; VI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; VI-NEXT: v_sub_u32_e32 v9, vcc, v12, v13 ; VI-NEXT: v_add_u32_e32 v9, vcc, 11, v9 -; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: .LBB10_13: ; %frem.loop_body60 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v12, v10 ; VI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -8125,7 +8125,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_13 ; VI-NEXT: ; %bb.14: ; %Flow129 ; VI-NEXT: v_mov_b32_e32 v10, v12 -; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; VI-NEXT: v_add_u32_e32 v9, vcc, -10, v9 ; VI-NEXT: v_ldexp_f32 v9, v10, v9 ; VI-NEXT: v_mul_f32_e32 v10, v9, v11 @@ -8143,7 +8143,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v9, |v3| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v9 ; VI-NEXT: s_cbranch_vccz 
.LBB10_18 -; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: ; %bb.17: ; %frem.else20 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v8, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 @@ -8152,7 +8152,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_24 ; VI-NEXT: .LBB10_18: ; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: .LBB10_19: ; %frem.compute52 +; VI-NEXT: .LBB10_19: ; %frem.compute19 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 ; VI-NEXT: v_frexp_mant_f32_e32 v8, v10 ; VI-NEXT: v_frexp_mant_f32_e32 v10, v9 @@ -8177,10 +8177,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 ; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_23 -; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 ; VI-NEXT: v_add_u32_e32 v10, vcc, 11, v10 -; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: .LBB10_21: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v13, v11 ; VI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -8195,7 +8195,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_21 ; VI-NEXT: ; %bb.22: ; %Flow125 ; VI-NEXT: v_mov_b32_e32 v11, v13 -; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v10, vcc, -10, v10 ; VI-NEXT: v_ldexp_f32 v10, v11, v10 ; VI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -8215,7 +8215,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v12, |v10| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 ; VI-NEXT: s_cbranch_vccz .LBB10_26 -; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v11, s2, 0, v9 ; VI-NEXT: 
v_cmp_eq_f32_e32 vcc, v13, v12 @@ -8224,7 +8224,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_32 ; VI-NEXT: .LBB10_26: ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB10_27: ; %frem.compute85 +; VI-NEXT: .LBB10_27: ; %frem.compute ; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 ; VI-NEXT: v_frexp_mant_f32_e32 v11, v13 ; VI-NEXT: v_frexp_mant_f32_e32 v13, v12 @@ -8249,10 +8249,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 ; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_31 -; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v13, vcc, 11, v13 -; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: .LBB10_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v16, v14 ; VI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -8267,7 +8267,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_29 ; VI-NEXT: ; %bb.30: ; %Flow ; VI-NEXT: v_mov_b32_e32 v14, v16 -; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v13, vcc, -10, v13 ; VI-NEXT: v_ldexp_f32 v13, v14, v13 ; VI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -8320,7 +8320,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v5, |v0| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; GFX9-NEXT: s_cbranch_vccz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else86 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v2 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -8329,7 +8329,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch 
.LBB10_8 ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: .LBB10_3: ; %frem.compute +; GFX9-NEXT: .LBB10_3: ; %frem.compute85 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -8354,10 +8354,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 -; GFX9-NEXT: .LBB10_5: ; %frem.loop_body +; GFX9-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -8372,7 +8372,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_5 ; GFX9-NEXT: ; %bb.6: ; %Flow133 ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -8391,7 +8391,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 ; GFX9-NEXT: s_cbranch_vccz .LBB10_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: ; %bb.9: ; %frem.else53 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v6, s2, 0, v5 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 @@ -8400,7 +8400,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_16 ; GFX9-NEXT: .LBB10_10: ; GFX9-NEXT: ; implicit-def: $vgpr6 -; GFX9-NEXT: .LBB10_11: ; 
%frem.compute19 +; GFX9-NEXT: .LBB10_11: ; %frem.compute52 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v8 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v8 ; GFX9-NEXT: v_frexp_mant_f32_e32 v8, v7 @@ -8425,10 +8425,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8 ; GFX9-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX9-NEXT: v_sub_u32_e32 v8, v11, v12 ; GFX9-NEXT: v_add_u32_e32 v8, 11, v8 -; GFX9-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX9-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v11, v9 ; GFX9-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -8443,7 +8443,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_13 ; GFX9-NEXT: ; %bb.14: ; %Flow129 ; GFX9-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX9-NEXT: v_add_u32_e32 v8, -10, v8 ; GFX9-NEXT: v_ldexp_f32 v8, v9, v8 ; GFX9-NEXT: v_mul_f32_e32 v9, v8, v10 @@ -8461,7 +8461,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v1| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 ; GFX9-NEXT: s_cbranch_vccz .LBB10_18 -; GFX9-NEXT: ; %bb.17: ; %frem.else53 +; GFX9-NEXT: ; %bb.17: ; %frem.else20 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v7, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 @@ -8470,7 +8470,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_24 ; GFX9-NEXT: .LBB10_18: ; GFX9-NEXT: ; implicit-def: $vgpr7 -; GFX9-NEXT: .LBB10_19: ; %frem.compute52 +; GFX9-NEXT: .LBB10_19: ; %frem.compute19 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 ; GFX9-NEXT: 
v_frexp_mant_f32_e32 v7, v9 ; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v8 @@ -8495,10 +8495,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 ; GFX9-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX9-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX9-NEXT: v_sub_u32_e32 v9, v12, v13 ; GFX9-NEXT: v_add_u32_e32 v9, 11, v9 -; GFX9-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX9-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -8513,7 +8513,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_21 ; GFX9-NEXT: ; %bb.22: ; %Flow125 ; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX9-NEXT: v_add_u32_e32 v9, -10, v9 ; GFX9-NEXT: v_ldexp_f32 v9, v10, v9 ; GFX9-NEXT: v_mul_f32_e32 v10, v9, v11 @@ -8532,7 +8532,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 ; GFX9-NEXT: s_cbranch_vccz .LBB10_26 -; GFX9-NEXT: ; %bb.25: ; %frem.else86 +; GFX9-NEXT: ; %bb.25: ; %frem.else ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v8 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10 @@ -8541,7 +8541,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_32 ; GFX9-NEXT: .LBB10_26: ; GFX9-NEXT: ; implicit-def: $vgpr9 -; GFX9-NEXT: .LBB10_27: ; %frem.compute85 +; GFX9-NEXT: .LBB10_27: ; %frem.compute ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 ; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v11 ; GFX9-NEXT: v_frexp_mant_f32_e32 v11, 
v10 @@ -8566,10 +8566,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 ; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX9-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v11, 11, v11 -; GFX9-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX9-NEXT: .LBB10_29: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -8584,7 +8584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_29 ; GFX9-NEXT: ; %bb.30: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v11, -10, v11 ; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -8640,7 +8640,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v5, |v0| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX10-NEXT: s_cbranch_vccz .LBB10_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else86 ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v2 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo @@ -8648,7 +8648,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_8 ; GFX10-NEXT: .LBB10_2: ; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: .LBB10_3: ; %frem.compute +; GFX10-NEXT: .LBB10_3: ; %frem.compute85 ; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX10-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ -8675,10 +8675,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_5: ; %frem.loop_body +; GFX10-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8694,7 +8694,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow133 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -8712,7 +8712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v5| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v8, v7 ; GFX10-NEXT: s_cbranch_vccz .LBB10_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: ; %bb.9: ; %frem.else53 ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, 0, v5 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v8, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo @@ -8720,7 +8720,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_16 ; GFX10-NEXT: .LBB10_10: ; GFX10-NEXT: ; implicit-def: $vgpr6 -; GFX10-NEXT: .LBB10_11: ; %frem.compute19 +; GFX10-NEXT: .LBB10_11: ; %frem.compute52 ; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v8 ; GFX10-NEXT: v_frexp_mant_f32_e32 v10, v7 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v8 @@ -8747,10 +8747,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v10 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, 
v7, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX10-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8766,7 +8766,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow129 ; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, v11 -; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -10, v10 ; GFX10-NEXT: v_ldexp_f32 v8, v8, v10 ; GFX10-NEXT: v_mul_f32_e32 v9, v8, v9 @@ -8783,7 +8783,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v1| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX10-NEXT: s_cbranch_vccz .LBB10_18 -; GFX10-NEXT: ; %bb.17: ; %frem.else53 +; GFX10-NEXT: ; %bb.17: ; %frem.else20 ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc_lo @@ -8791,7 +8791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_24 ; GFX10-NEXT: .LBB10_18: ; GFX10-NEXT: ; implicit-def: $vgpr7 -; GFX10-NEXT: .LBB10_19: ; %frem.compute52 +; GFX10-NEXT: .LBB10_19: ; %frem.compute19 ; GFX10-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX10-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 @@ -8818,10 +8818,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11 ; GFX10-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX10-NEXT: ; %bb.20: ; 
%frem.loop_body60.preheader +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX10-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8837,7 +8837,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.22: ; %Flow125 ; GFX10-NEXT: v_mov_b32_e32 v11, s2 ; GFX10-NEXT: v_mov_b32_e32 v9, v12 -; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX10-NEXT: v_ldexp_f32 v9, v9, v11 ; GFX10-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -8855,7 +8855,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v11, |v8| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v11, v10 ; GFX10-NEXT: s_cbranch_vccz .LBB10_26 -; GFX10-NEXT: ; %bb.25: ; %frem.else86 +; GFX10-NEXT: ; %bb.25: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, 0, v8 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo @@ -8863,7 +8863,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_32 ; GFX10-NEXT: .LBB10_26: ; GFX10-NEXT: ; implicit-def: $vgpr9 -; GFX10-NEXT: .LBB10_27: ; %frem.compute85 +; GFX10-NEXT: .LBB10_27: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e32 v9, v11 ; GFX10-NEXT: v_frexp_mant_f32_e32 v13, v10 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v11 @@ -8890,10 +8890,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v13 ; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX10-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX10-NEXT: ; %bb.28: ; 
%frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX10-NEXT: .LBB10_29: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8909,7 +8909,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.30: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v13, s2 ; GFX10-NEXT: v_mov_b32_e32 v11, v14 -; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v13, -10, v13 ; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 ; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -8963,7 +8963,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 @@ -8974,7 +8974,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_8 ; GFX11-TRUE16-NEXT: .LBB10_2: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 -; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ -9010,11 +9010,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX11-TRUE16-NEXT: ; %bb.4: ; 
%frem.loop_body.preheader +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v6 @@ -9034,7 +9034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v9 -; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, v8 @@ -9061,7 +9061,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10 -; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 @@ -9072,7 +9072,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_16 ; GFX11-TRUE16-NEXT: .LBB10_10: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v10, 
v9 @@ -9108,11 +9108,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v9 @@ -9132,7 +9132,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v12 -; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, v11 @@ -9156,7 +9156,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18 -; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 @@ -9167,7 +9167,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_24 ; 
GFX11-TRUE16-NEXT: .LBB10_18: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 -; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v10 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v12, v9 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 @@ -9203,11 +9203,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v10 @@ -9227,7 +9227,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, v12 @@ -9254,7 +9254,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26 -; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; 
GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 @@ -9265,7 +9265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_32 ; GFX11-TRUE16-NEXT: .LBB10_26: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v13 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v15, v12 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 @@ -9301,11 +9301,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, v13 @@ -9325,7 +9325,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 ; GFX11-TRUE16-NEXT: v_ldexp_f32 
v13, v13, v15 @@ -9388,7 +9388,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9397,7 +9397,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_8 ; GFX11-FAKE16-NEXT: .LBB10_2: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 -; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ -9433,11 +9433,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v6 @@ -9457,7 +9457,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-FAKE16-NEXT: 
v_mov_b32_e32 v6, v9 -; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, v8 @@ -9483,7 +9483,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_10 -; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_16 ; GFX11-FAKE16-NEXT: .LBB10_10: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 @@ -9528,11 +9528,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v9 @@ -9552,7 +9552,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v12 -; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, v11 @@ -9575,7 +9575,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_18 -; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9584,7 +9584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_24 ; GFX11-FAKE16-NEXT: .LBB10_18: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 -; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v10 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v12, v9 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 @@ -9620,11 +9620,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-FAKE16-NEXT: ; %bb.20: ; 
%frem.loop_body27.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v10 @@ -9644,7 +9644,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, v12 @@ -9670,7 +9670,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_26 -; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9679,7 +9679,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_32 ; GFX11-FAKE16-NEXT: .LBB10_26: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v13 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v15, v12 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 @@ -9715,11 
+9715,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v13 @@ -9739,7 +9739,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v13, v15 @@ -9804,7 +9804,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 @@ -9816,7 +9816,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1150-TRUE16-NEXT: .LBB10_2: ; 
GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -9851,11 +9851,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s6, s8, s6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, 11 -; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -9877,7 +9877,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -9907,7 +9907,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: ; 
%bb.9: ; %frem.else53 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -9919,7 +9919,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1150-TRUE16-NEXT: .LBB10_10: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -9954,11 +9954,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -9980,7 +9980,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; 
GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10008,7 +10008,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10020,7 +10020,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1150-TRUE16-NEXT: .LBB10_18: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -10055,11 +10055,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -10081,7 +10081,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX1150-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -10111,7 +10111,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 @@ -10123,7 +10123,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1150-TRUE16-NEXT: .LBB10_26: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 -; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -10158,11 +10158,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s11, s12, s11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, 11 -; GFX1150-TRUE16-NEXT: .LBB10_29: ; 
%frem.loop_body93 +; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v8, v5 @@ -10184,7 +10184,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -10265,7 +10265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10275,7 +10275,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_8 ; GFX1150-FAKE16-NEXT: .LBB10_2: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -10310,11 +10310,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s6, s8, s6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, 11 -; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -10336,7 +10336,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -10365,7 +10365,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10375,7 +10375,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_16 ; GFX1150-FAKE16-NEXT: .LBB10_10: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; 
GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -10410,11 +10410,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -10436,7 +10436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10463,7 +10463,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 ; 
GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10473,7 +10473,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_24 ; GFX1150-FAKE16-NEXT: .LBB10_18: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -10508,11 +10508,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -10534,7 +10534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -10563,7 +10563,7 @@ define amdgpu_kernel void @frem_v4f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10573,7 +10573,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_32 ; GFX1150-FAKE16-NEXT: .LBB10_26: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -10608,11 +10608,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s11, s12, s11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, 11 -; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v8, v5 @@ -10634,7 +10634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-FAKE16-NEXT: 
v_mov_b32_e32 v5, v8 -; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -10712,7 +10712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 @@ -10724,7 +10724,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1200-TRUE16-NEXT: .LBB10_2: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -10759,11 +10759,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s6, s8, s6 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, 11 -; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -10787,7 +10787,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -10821,7 +10821,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10833,7 +10833,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1200-TRUE16-NEXT: .LBB10_10: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -10869,11 +10869,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; 
GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -10897,7 +10897,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10928,7 +10928,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10941,7 +10941,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1200-TRUE16-NEXT: .LBB10_18: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -10977,11 +10977,11 @@ define amdgpu_kernel void @frem_v4f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -11005,7 +11005,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -11039,7 +11039,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 @@ -11051,7 +11051,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1200-TRUE16-NEXT: .LBB10_26: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 -; 
GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -11087,11 +11087,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s11, s12, s11 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, 11 -; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v8, v5 @@ -11115,7 +11115,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -11203,7 +11203,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 ; 
GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11213,7 +11213,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_8 ; GFX1200-FAKE16-NEXT: .LBB10_2: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -11249,11 +11249,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s6, s8, s6 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, 11 -; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -11277,7 +11277,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -11310,7 +11310,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11321,7 +11321,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_16 ; GFX1200-FAKE16-NEXT: .LBB10_10: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -11357,11 +11357,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -11385,7 +11385,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; 
GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -11415,7 +11415,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11426,7 +11426,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_24 ; GFX1200-FAKE16-NEXT: .LBB10_18: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -11462,11 +11462,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -11490,7 +11490,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11534,7 +11534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_32 ; GFX1200-FAKE16-NEXT: .LBB10_26: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -11570,11 +11570,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 
s11, s12, s11 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, 11 -; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v8, v5 @@ -11598,7 +11598,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -11686,7 +11686,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB11_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else16 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -11697,7 +11697,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB11_2: ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB11_3: ; %frem.compute +; SI-NEXT: .LBB11_3: ; %frem.compute15 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 @@ -11733,10 +11733,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB11_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; 
%frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB11_5: ; %frem.loop_body +; SI-NEXT: .LBB11_5: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -11751,7 +11751,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB11_5 ; SI-NEXT: ; %bb.6: ; %Flow51 ; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: .LBB11_7: ; %frem.loop_exit +; SI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s3 ; SI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -11767,7 +11767,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB11_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -11778,7 +11778,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB11_10: ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB11_11: ; %frem.compute15 +; SI-NEXT: .LBB11_11: ; %frem.compute ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 @@ -11814,10 +11814,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB11_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB11_13: ; %frem.loop_body23 +; SI-NEXT: .LBB11_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
SI-NEXT: v_mov_b32_e32 v8, v6 ; SI-NEXT: v_mul_f32_e32 v6, v8, v7 @@ -11832,7 +11832,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB11_13 ; SI-NEXT: ; %bb.14: ; %Flow ; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB11_15: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v6, v6, s3 ; SI-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -11877,7 +11877,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB11_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -11886,7 +11886,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_8 ; CI-NEXT: .LBB11_2: ; CI-NEXT: ; implicit-def: $vgpr4 -; CI-NEXT: .LBB11_3: ; %frem.compute +; CI-NEXT: .LBB11_3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; CI-NEXT: v_ldexp_f32_e64 v5, v5, 1 ; CI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -11911,10 +11911,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 ; CI-NEXT: v_add_i32_e32 v6, vcc, 12, v6 -; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: .LBB11_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -11929,7 +11929,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz 
.LBB11_5 ; CI-NEXT: ; %bb.6: ; %Flow51 ; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 ; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 ; CI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -11945,7 +11945,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB11_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -11954,7 +11954,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_16 ; CI-NEXT: .LBB11_10: ; CI-NEXT: ; implicit-def: $vgpr5 -; CI-NEXT: .LBB11_11: ; %frem.compute15 +; CI-NEXT: .LBB11_11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1 ; CI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 @@ -11979,10 +11979,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 ; CI-NEXT: v_add_i32_e32 v7, vcc, 12, v7 -; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: .LBB11_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v10, v8 ; CI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -11997,7 +11997,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB11_13 ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v8, v10 -; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit ; CI-NEXT: 
v_add_i32_e32 v7, vcc, -11, v7 ; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12042,7 +12042,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -12051,7 +12051,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_8 ; VI-NEXT: .LBB11_2: ; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: .LBB11_3: ; %frem.compute +; VI-NEXT: .LBB11_3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; VI-NEXT: v_ldexp_f32 v5, v5, 1 ; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -12076,10 +12076,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 ; VI-NEXT: v_add_u32_e32 v6, vcc, 12, v6 -; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: .LBB11_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -12094,7 +12094,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB11_5 ; VI-NEXT: ; %bb.6: ; %Flow51 ; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6 ; VI-NEXT: v_ldexp_f32 v6, v7, v6 ; VI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -12110,7 +12110,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB11_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -12119,7 +12119,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_16 ; VI-NEXT: .LBB11_10: ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: .LBB11_11: ; %frem.compute15 +; VI-NEXT: .LBB11_11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; VI-NEXT: v_ldexp_f32 v6, v6, 1 ; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 @@ -12144,10 +12144,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v7 -; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: .LBB11_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v10, v8 ; VI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -12162,7 +12162,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB11_13 ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v8, v10 -; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7 ; VI-NEXT: v_ldexp_f32 v7, v8, v7 ; VI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12202,7 +12202,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; 
GFX9-NEXT: ; %bb.1: ; %frem.else16 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v0 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -12211,7 +12211,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB11_8 ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: .LBB11_3: ; %frem.compute +; GFX9-NEXT: .LBB11_3: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX9-NEXT: v_ldexp_f32 v5, v5, 1 ; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -12236,10 +12236,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 12, v6 -; GFX9-NEXT: .LBB11_5: ; %frem.loop_body +; GFX9-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -12254,7 +12254,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB11_5 ; GFX9-NEXT: ; %bb.6: ; %Flow51 ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX9-NEXT: v_add_u32_e32 v6, -11, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -12270,7 +12270,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB11_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v5, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, 
|v1|, |v3| @@ -12279,7 +12279,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB11_16 ; GFX9-NEXT: .LBB11_10: ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: .LBB11_11: ; %frem.compute15 +; GFX9-NEXT: .LBB11_11: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX9-NEXT: v_ldexp_f32 v6, v6, 1 ; GFX9-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 @@ -12304,10 +12304,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v7, v10, v11 ; GFX9-NEXT: v_add_u32_e32 v7, 12, v7 -; GFX9-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB11_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v10, v8 ; GFX9-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -12322,7 +12322,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB11_13 ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v7, -11, v7 ; GFX9-NEXT: v_ldexp_f32 v7, v8, v7 ; GFX9-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12363,7 +12363,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB11_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else16 ; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo @@ -12371,7 +12371,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB11_8 ; GFX10-NEXT: .LBB11_2: ; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: .LBB11_3: ; %frem.compute +; GFX10-NEXT: .LBB11_3: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX10-NEXT: v_frexp_mant_f32_e64 v4, |v0| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 @@ -12398,10 +12398,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB11_5: ; %frem.loop_body +; GFX10-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -12417,7 +12417,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow51 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_add_nc_u32_e32 v8, -11, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -12432,7 +12432,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB11_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| ; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo @@ -12440,7 +12440,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB11_16 ; GFX10-NEXT: .LBB11_10: ; 
GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: .LBB11_11: ; %frem.compute15 +; GFX10-NEXT: .LBB11_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v1| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 @@ -12467,10 +12467,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9 ; GFX10-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB11_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -12486,7 +12486,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v9, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, v10 -; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v9, -11, v9 ; GFX10-NEXT: v_ldexp_f32 v7, v7, v9 ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v8 @@ -12524,7 +12524,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else16 ; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12533,7 +12533,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB11_8 ; GFX11-NEXT: .LBB11_2: ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: .LBB11_3: ; %frem.compute +; 
GFX11-NEXT: .LBB11_3: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX11-NEXT: v_frexp_mant_f32_e64 v4, |v0| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 @@ -12569,11 +12569,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB11_5: ; %frem.loop_body +; GFX11-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v9, v6 @@ -12593,7 +12593,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow51 ; GFX11-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-NEXT: v_mov_b32_e32 v6, v9 -; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v8, -11, v8 ; GFX11-NEXT: v_ldexp_f32 v6, v6, v8 @@ -12613,7 +12613,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB11_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else ; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12622,7 +12622,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB11_16 ; GFX11-NEXT: .LBB11_10: ; GFX11-NEXT: ; 
implicit-def: $vgpr5 -; GFX11-NEXT: .LBB11_11: ; %frem.compute15 +; GFX11-NEXT: .LBB11_11: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v1| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 @@ -12658,11 +12658,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB11_13: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v10, v7 @@ -12682,7 +12682,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow ; GFX11-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v9, -11, v9 ; GFX11-NEXT: v_ldexp_f32 v7, v7, v9 @@ -12730,7 +12730,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8 ; GFX1150-NEXT: s_cbranch_scc0 .LBB11_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 ; GFX1150-NEXT: s_cmp_eq_f32 s3, s8 ; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12740,7 +12740,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: 
s_branch .LBB11_8 ; GFX1150-NEXT: .LBB11_2: ; GFX1150-NEXT: ; implicit-def: $vgpr0 -; GFX1150-NEXT: .LBB11_3: ; %frem.compute +; GFX1150-NEXT: .LBB11_3: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s6| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -12775,11 +12775,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1150-NEXT: s_sub_i32 s7, s7, s8 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s7, s7, 12 -; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v5, v2 @@ -12801,7 +12801,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow51 ; GFX1150-NEXT: v_mov_b32_e32 v4, s7 ; GFX1150-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 @@ -12824,7 +12824,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s6, s8 ; GFX1150-NEXT: s_cbranch_scc0 .LBB11_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else ; GFX1150-NEXT: s_cmp_eq_f32 s6, s8 ; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12834,7 +12834,7 
@@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB11_16 ; GFX1150-NEXT: .LBB11_10: ; GFX1150-NEXT: ; implicit-def: $vgpr1 -; GFX1150-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB11_11: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s2| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 @@ -12869,11 +12869,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s7, s7, s8 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s7, s7, 12 -; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v6, v3 @@ -12895,7 +12895,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow ; GFX1150-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 @@ -12950,7 +12950,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8 ; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 ; GFX1200-NEXT: s_cmp_eq_f32 s3, s8 ; 
GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12960,7 +12960,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB11_8 ; GFX1200-NEXT: .LBB11_2: ; GFX1200-NEXT: ; implicit-def: $vgpr0 -; GFX1200-NEXT: .LBB11_3: ; %frem.compute +; GFX1200-NEXT: .LBB11_3: ; %frem.compute15 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -12996,11 +12996,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 -; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body +; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v5, v2 @@ -13024,7 +13024,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.6: ; %Flow51 ; GFX1200-NEXT: v_mov_b32_e32 v4, s7 ; GFX1200-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 @@ -13048,7 +13048,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8 ; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else ; 
GFX1200-NEXT: s_cmp_eq_f32 s6, s8 ; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -13059,7 +13059,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB11_16 ; GFX1200-NEXT: .LBB11_10: ; GFX1200-NEXT: ; implicit-def: $vgpr1 -; GFX1200-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB11_11: ; %frem.compute ; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 @@ -13095,11 +13095,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 -; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v6, v3 @@ -13123,7 +13123,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow ; GFX1200-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 @@ -13187,7 +13187,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_2 -; SI-NEXT: ; %bb.1: ; 
%frem.else +; SI-NEXT: ; %bb.1: ; %frem.else78 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13198,7 +13198,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_2: ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_3: ; %frem.compute +; SI-NEXT: .LBB12_3: ; %frem.compute77 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v0 @@ -13234,10 +13234,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_5: ; %frem.loop_body +; SI-NEXT: .LBB12_5: ; %frem.loop_body85 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v11, v9 ; SI-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -13252,7 +13252,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_5 ; SI-NEXT: ; %bb.6: ; %Flow125 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB12_7: ; %frem.loop_exit +; SI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v9, v9, s3 ; SI-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -13268,7 +13268,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else47 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13279,7 +13279,7 @@ define amdgpu_kernel void @frem_v4f32(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_10: ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_11: ; %frem.compute15 +; SI-NEXT: .LBB12_11: ; %frem.compute46 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v1 @@ -13315,10 +13315,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_13: ; %frem.loop_body23 +; SI-NEXT: .LBB12_13: ; %frem.loop_body54 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v12, v10 ; SI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -13333,7 +13333,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_13 ; SI-NEXT: ; %bb.14: ; %Flow121 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v10, v10, s3 ; SI-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -13349,7 +13349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_18 -; SI-NEXT: ; %bb.17: ; %frem.else47 +; SI-NEXT: ; %bb.17: ; %frem.else16 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -13360,7 +13360,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_18: ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_19: ; %frem.compute46 +; SI-NEXT: .LBB12_19: ; 
%frem.compute15 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 @@ -13396,10 +13396,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; SI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_21: ; %frem.loop_body54 +; SI-NEXT: .LBB12_21: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v13, v11 ; SI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13414,7 +13414,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_21 ; SI-NEXT: ; %bb.22: ; %Flow117 ; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; SI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v11, v11, s3 ; SI-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -13430,7 +13430,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_26 -; SI-NEXT: ; %bb.25: ; %frem.else78 +; SI-NEXT: ; %bb.25: ; %frem.else ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -13441,7 +13441,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_26: ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_27: ; %frem.compute77 +; SI-NEXT: .LBB12_27: ; %frem.compute ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 @@ -13477,10 +13477,10 @@ define amdgpu_kernel void 
@frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_29: ; %frem.loop_body85 +; SI-NEXT: .LBB12_29: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v14, v12 ; SI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13495,7 +13495,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_29 ; SI-NEXT: ; %bb.30: ; %Flow ; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; SI-NEXT: .LBB12_31: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v12, v12, s3 ; SI-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -13548,7 +13548,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else78 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13557,7 +13557,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_8 ; CI-NEXT: .LBB12_2: ; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB12_3: ; %frem.compute +; CI-NEXT: .LBB12_3: ; %frem.compute77 ; CI-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; CI-NEXT: v_ldexp_f32_e64 v9, v9, 1 ; CI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -13582,10 +13582,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_7 -; CI-NEXT: ; %bb.4: ; 
%frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 ; CI-NEXT: v_add_i32_e32 v10, vcc, 12, v10 -; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: .LBB12_5: ; %frem.loop_body85 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v13, v11 ; CI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13600,7 +13600,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_5 ; CI-NEXT: ; %bb.6: ; %Flow125 ; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 ; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 ; CI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -13616,7 +13616,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else47 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13625,7 +13625,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_16 ; CI-NEXT: .LBB12_10: ; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB12_11: ; %frem.compute15 +; CI-NEXT: .LBB12_11: ; %frem.compute46 ; CI-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; CI-NEXT: v_ldexp_f32_e64 v10, v10, 1 ; CI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -13650,10 +13650,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 
v11, vcc, 12, v11 -; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: .LBB12_13: ; %frem.loop_body54 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v14, v12 ; CI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13668,7 +13668,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_13 ; CI-NEXT: ; %bb.14: ; %Flow121 ; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 ; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 ; CI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -13684,7 +13684,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_18 -; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: ; %bb.17: ; %frem.else16 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -13693,7 +13693,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_24 ; CI-NEXT: .LBB12_18: ; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB12_19: ; %frem.compute46 +; CI-NEXT: .LBB12_19: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; CI-NEXT: v_ldexp_f32_e64 v11, v11, 1 ; CI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -13718,10 +13718,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 ; CI-NEXT: v_add_i32_e32 v12, vcc, 12, v12 -; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: .LBB12_21: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v13 ; CI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -13736,7 +13736,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_21 ; CI-NEXT: ; %bb.22: ; %Flow117 ; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 ; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 ; CI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -13752,7 +13752,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_26 -; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -13761,7 +13761,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_32 ; CI-NEXT: .LBB12_26: ; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: .LBB12_27: ; %frem.compute77 +; CI-NEXT: .LBB12_27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; CI-NEXT: v_ldexp_f32_e64 v12, v12, 1 ; CI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -13786,10 +13786,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v13, vcc, 12, v13 -; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: .LBB12_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v16, v14 ; CI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -13804,7 +13804,7 @@ define amdgpu_kernel 
void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_29 ; CI-NEXT: ; %bb.30: ; %Flow ; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 ; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 ; CI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -13857,7 +13857,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else78 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13866,7 +13866,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_8 ; VI-NEXT: .LBB12_2: ; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: .LBB12_3: ; %frem.compute +; VI-NEXT: .LBB12_3: ; %frem.compute77 ; VI-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; VI-NEXT: v_ldexp_f32 v9, v9, 1 ; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -13891,10 +13891,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 ; VI-NEXT: v_add_u32_e32 v10, vcc, 12, v10 -; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: .LBB12_5: ; %frem.loop_body85 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v13, v11 ; VI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13909,7 +13909,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_5 ; VI-NEXT: ; %bb.6: ; %Flow125 ; VI-NEXT: v_mov_b32_e32 v11, v13 -; 
VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10 ; VI-NEXT: v_ldexp_f32 v10, v11, v10 ; VI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -13925,7 +13925,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else47 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13934,7 +13934,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_16 ; VI-NEXT: .LBB12_10: ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: .LBB12_11: ; %frem.compute15 +; VI-NEXT: .LBB12_11: ; %frem.compute46 ; VI-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; VI-NEXT: v_ldexp_f32 v10, v10, 1 ; VI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -13959,10 +13959,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; VI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; VI-NEXT: v_sub_u32_e32 v11, vcc, v14, v15 ; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v11 -; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: .LBB12_13: ; %frem.loop_body54 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v14, v12 ; VI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13977,7 +13977,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_13 ; VI-NEXT: ; %bb.14: ; %Flow121 ; VI-NEXT: v_mov_b32_e32 v12, v14 -; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; VI-NEXT: v_add_u32_e32 v11, vcc, -11, v11 ; VI-NEXT: v_ldexp_f32 
v11, v12, v11 ; VI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -13993,7 +13993,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_18 -; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: ; %bb.17: ; %frem.else16 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -14002,7 +14002,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_24 ; VI-NEXT: .LBB12_18: ; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: .LBB12_19: ; %frem.compute46 +; VI-NEXT: .LBB12_19: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; VI-NEXT: v_ldexp_f32 v11, v11, 1 ; VI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -14027,10 +14027,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; VI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_23 -; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v12, vcc, v15, v16 ; VI-NEXT: v_add_u32_e32 v12, vcc, 12, v12 -; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: .LBB12_21: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v15, v13 ; VI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -14045,7 +14045,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_21 ; VI-NEXT: ; %bb.22: ; %Flow117 ; VI-NEXT: v_mov_b32_e32 v13, v15 -; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v12, vcc, -11, v12 ; VI-NEXT: v_ldexp_f32 v12, v13, v12 ; VI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -14061,7 +14061,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_26 -; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -14070,7 +14070,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_32 ; VI-NEXT: .LBB12_26: ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB12_27: ; %frem.compute77 +; VI-NEXT: .LBB12_27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; VI-NEXT: v_ldexp_f32 v12, v12, 1 ; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -14095,10 +14095,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_31 -; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v13, vcc, 12, v13 -; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: .LBB12_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v16, v14 ; VI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -14113,7 +14113,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_29 ; VI-NEXT: ; %bb.30: ; %Flow ; VI-NEXT: v_mov_b32_e32 v14, v16 -; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13 ; VI-NEXT: v_ldexp_f32 v13, v14, v13 ; VI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -14161,7 +14161,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_2 -; 
GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else78 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v8, s2, 0, v0 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -14170,7 +14170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_8 ; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: ; implicit-def: $vgpr8 -; GFX9-NEXT: .LBB12_3: ; %frem.compute +; GFX9-NEXT: .LBB12_3: ; %frem.compute77 ; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX9-NEXT: v_ldexp_f32 v9, v9, 1 ; GFX9-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -14195,10 +14195,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; GFX9-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX9-NEXT: v_sub_u32_e32 v10, v13, v14 ; GFX9-NEXT: v_add_u32_e32 v10, 12, v10 -; GFX9-NEXT: .LBB12_5: ; %frem.loop_body +; GFX9-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -14213,7 +14213,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_5 ; GFX9-NEXT: ; %bb.6: ; %Flow125 ; GFX9-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX9-NEXT: v_add_u32_e32 v10, -11, v10 ; GFX9-NEXT: v_ldexp_f32 v10, v11, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -14229,7 +14229,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else47 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: 
v_bfi_b32 v9, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -14238,7 +14238,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_16 ; GFX9-NEXT: .LBB12_10: ; GFX9-NEXT: ; implicit-def: $vgpr9 -; GFX9-NEXT: .LBB12_11: ; %frem.compute15 +; GFX9-NEXT: .LBB12_11: ; %frem.compute46 ; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX9-NEXT: v_ldexp_f32 v10, v10, 1 ; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -14263,10 +14263,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v11, 12, v11 -; GFX9-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -14281,7 +14281,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_13 ; GFX9-NEXT: ; %bb.14: ; %Flow121 ; GFX9-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX9-NEXT: v_add_u32_e32 v11, -11, v11 ; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -14297,7 +14297,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_18 -; GFX9-NEXT: ; %bb.17: ; %frem.else47 +; GFX9-NEXT: ; %bb.17: ; %frem.else16 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v10, s2, 0, v2 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -14306,7 
+14306,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_24 ; GFX9-NEXT: .LBB12_18: ; GFX9-NEXT: ; implicit-def: $vgpr10 -; GFX9-NEXT: .LBB12_19: ; %frem.compute46 +; GFX9-NEXT: .LBB12_19: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX9-NEXT: v_ldexp_f32 v11, v11, 1 ; GFX9-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -14331,10 +14331,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; GFX9-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX9-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v12, v15, v16 ; GFX9-NEXT: v_add_u32_e32 v12, 12, v12 -; GFX9-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX9-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v15, v13 ; GFX9-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -14349,7 +14349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_21 ; GFX9-NEXT: ; %bb.22: ; %Flow117 ; GFX9-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX9-NEXT: v_add_u32_e32 v12, -11, v12 ; GFX9-NEXT: v_ldexp_f32 v12, v13, v12 ; GFX9-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -14365,7 +14365,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_26 -; GFX9-NEXT: ; %bb.25: ; %frem.else78 +; GFX9-NEXT: ; %bb.25: ; %frem.else ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v11, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -14374,7 +14374,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_32 ; GFX9-NEXT: .LBB12_26: ; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB12_27: ; %frem.compute77 +; GFX9-NEXT: .LBB12_27: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX9-NEXT: v_ldexp_f32 v12, v12, 1 ; GFX9-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -14399,10 +14399,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; GFX9-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX9-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v13, v16, v17 ; GFX9-NEXT: v_add_u32_e32 v13, 12, v13 -; GFX9-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX9-NEXT: .LBB12_29: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v16, v14 ; GFX9-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -14417,7 +14417,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_29 ; GFX9-NEXT: ; %bb.30: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v13, -11, v13 ; GFX9-NEXT: v_ldexp_f32 v13, v14, v13 ; GFX9-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -14466,7 +14466,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else78 ; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo @@ -14474,7 +14474,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_8 ; GFX10-NEXT: 
.LBB12_2: ; GFX10-NEXT: ; implicit-def: $vgpr8 -; GFX10-NEXT: .LBB12_3: ; %frem.compute +; GFX10-NEXT: .LBB12_3: ; %frem.compute77 ; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX10-NEXT: v_frexp_mant_f32_e64 v8, |v0| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 @@ -14501,10 +14501,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12 ; GFX10-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_5: ; %frem.loop_body +; GFX10-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v13, v10 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14520,7 +14520,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow125 ; GFX10-NEXT: v_mov_b32_e32 v12, s2 ; GFX10-NEXT: v_mov_b32_e32 v10, v13 -; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -11, v12 ; GFX10-NEXT: v_ldexp_f32 v10, v10, v12 ; GFX10-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -14535,7 +14535,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else47 ; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| ; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo @@ -14543,7 +14543,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_16 ; GFX10-NEXT: .LBB12_10: ; GFX10-NEXT: ; implicit-def: $vgpr9 -; GFX10-NEXT: 
.LBB12_11: ; %frem.compute15 +; GFX10-NEXT: .LBB12_11: ; %frem.compute46 ; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v1| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 @@ -14570,10 +14570,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13 ; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14589,7 +14589,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow121 ; GFX10-NEXT: v_mov_b32_e32 v13, s2 ; GFX10-NEXT: v_mov_b32_e32 v11, v14 -; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -11, v13 ; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 ; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -14604,7 +14604,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_18 -; GFX10-NEXT: ; %bb.17: ; %frem.else47 +; GFX10-NEXT: ; %bb.17: ; %frem.else16 ; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| ; GFX10-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo @@ -14612,7 +14612,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_24 ; GFX10-NEXT: .LBB12_18: ; GFX10-NEXT: ; implicit-def: $vgpr10 -; GFX10-NEXT: .LBB12_19: ; %frem.compute46 +; 
GFX10-NEXT: .LBB12_19: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v2| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 @@ -14639,10 +14639,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14 ; GFX10-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX10-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX10-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v15, v12 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14658,7 +14658,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.22: ; %Flow117 ; GFX10-NEXT: v_mov_b32_e32 v14, s2 ; GFX10-NEXT: v_mov_b32_e32 v12, v15 -; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX10-NEXT: v_add_nc_u32_e32 v14, -11, v14 ; GFX10-NEXT: v_ldexp_f32 v12, v12, v14 ; GFX10-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -14673,7 +14673,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_26 -; GFX10-NEXT: ; %bb.25: ; %frem.else78 +; GFX10-NEXT: ; %bb.25: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| ; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo @@ -14681,7 +14681,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_32 ; GFX10-NEXT: .LBB12_26: ; GFX10-NEXT: ; implicit-def: $vgpr11 -; GFX10-NEXT: .LBB12_27: ; %frem.compute77 +; GFX10-NEXT: .LBB12_27: ; %frem.compute ; 
GFX10-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v3| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 @@ -14708,10 +14708,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15 ; GFX10-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX10-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX10-NEXT: .LBB12_29: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v16, v13 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14727,7 +14727,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.30: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v15, s2 ; GFX10-NEXT: v_mov_b32_e32 v13, v16 -; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v15, -11, v15 ; GFX10-NEXT: v_ldexp_f32 v13, v13, v15 ; GFX10-NEXT: v_mul_f32_e32 v14, v13, v14 @@ -14773,7 +14773,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else78 ; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14782,7 +14782,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_8 ; GFX11-NEXT: .LBB12_2: ; GFX11-NEXT: ; implicit-def: $vgpr8 -; GFX11-NEXT: .LBB12_3: ; %frem.compute +; GFX11-NEXT: .LBB12_3: ; %frem.compute77 ; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX11-NEXT: 
v_frexp_mant_f32_e64 v8, |v0| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 @@ -14818,11 +14818,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_5: ; %frem.loop_body +; GFX11-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v13, v10 @@ -14842,7 +14842,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow125 ; GFX11-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v12, -11, v12 ; GFX11-NEXT: v_ldexp_f32 v10, v10, v12 @@ -14862,7 +14862,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else47 ; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14871,7 +14871,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_16 ; GFX11-NEXT: .LBB12_10: ; GFX11-NEXT: ; implicit-def: $vgpr9 -; GFX11-NEXT: .LBB12_11: ; %frem.compute15 +; GFX11-NEXT: .LBB12_11: 
; %frem.compute46 ; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v1| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 @@ -14907,11 +14907,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v14, v11 @@ -14931,7 +14931,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow121 ; GFX11-NEXT: v_mov_b32_e32 v13, s2 ; GFX11-NEXT: v_mov_b32_e32 v11, v14 -; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v13, -11, v13 ; GFX11-NEXT: v_ldexp_f32 v11, v11, v13 @@ -14951,7 +14951,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_18 -; GFX11-NEXT: ; %bb.17: ; %frem.else47 +; GFX11-NEXT: ; %bb.17: ; %frem.else16 ; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14960,7 +14960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_24 ; GFX11-NEXT: .LBB12_18: ; 
GFX11-NEXT: ; implicit-def: $vgpr10 -; GFX11-NEXT: .LBB12_19: ; %frem.compute46 +; GFX11-NEXT: .LBB12_19: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v2| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 @@ -14996,11 +14996,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX11-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX11-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX11-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v15, v12 @@ -15020,7 +15020,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.22: ; %Flow117 ; GFX11-NEXT: v_mov_b32_e32 v14, s2 ; GFX11-NEXT: v_mov_b32_e32 v12, v15 -; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v14, -11, v14 ; GFX11-NEXT: v_ldexp_f32 v12, v12, v14 @@ -15040,7 +15040,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_26 -; GFX11-NEXT: ; %bb.25: ; %frem.else78 +; GFX11-NEXT: ; %bb.25: ; %frem.else ; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -15049,7 +15049,7 @@ define amdgpu_kernel void @frem_v4f32(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_32 ; GFX11-NEXT: .LBB12_26: ; GFX11-NEXT: ; implicit-def: $vgpr11 -; GFX11-NEXT: .LBB12_27: ; %frem.compute77 +; GFX11-NEXT: .LBB12_27: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v3| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 @@ -15085,11 +15085,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX11-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX11-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX11-NEXT: .LBB12_29: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v16, v13 @@ -15109,7 +15109,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.30: ; %Flow ; GFX11-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v15, -11, v15 ; GFX11-NEXT: v_ldexp_f32 v13, v13, v15 @@ -15170,7 +15170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else78 ; GFX1150-NEXT: s_cmp_eq_f32 s5, s12 ; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, 
-1, 0 @@ -15180,7 +15180,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_8 ; GFX1150-NEXT: .LBB12_2: ; GFX1150-NEXT: ; implicit-def: $vgpr0 -; GFX1150-NEXT: .LBB12_3: ; %frem.compute +; GFX1150-NEXT: .LBB12_3: ; %frem.compute77 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -15215,11 +15215,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v5, v2 @@ -15241,7 +15241,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow125 ; GFX1150-NEXT: v_mov_b32_e32 v4, s11 ; GFX1150-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 @@ -15264,7 +15264,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s8, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else47 ; 
GFX1150-NEXT: s_cmp_eq_f32 s8, s12 ; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15274,7 +15274,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_16 ; GFX1150-NEXT: .LBB12_10: ; GFX1150-NEXT: ; implicit-def: $vgpr1 -; GFX1150-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB12_11: ; %frem.compute46 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s4| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s10| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -15309,11 +15309,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v6, v3 @@ -15335,7 +15335,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow121 ; GFX1150-NEXT: v_mov_b32_e32 v5, s11 ; GFX1150-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 @@ -15358,7 +15358,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 
s10, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_18 -; GFX1150-NEXT: ; %bb.17: ; %frem.else47 +; GFX1150-NEXT: ; %bb.17: ; %frem.else16 ; GFX1150-NEXT: s_cmp_eq_f32 s10, s12 ; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15368,7 +15368,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_24 ; GFX1150-NEXT: .LBB12_18: ; GFX1150-NEXT: ; implicit-def: $vgpr2 -; GFX1150-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1150-NEXT: .LBB12_19: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s3| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 @@ -15403,11 +15403,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 ; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v7, v4 @@ -15429,7 +15429,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.22: ; %Flow117 ; GFX1150-NEXT: v_mov_b32_e32 v6, s11 ; GFX1150-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v6, -11, v6 ; GFX1150-NEXT: v_ldexp_f32 v4, v4, v6 @@ -15452,7 +15452,7 @@ define amdgpu_kernel void 
@frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s9, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_26 -; GFX1150-NEXT: ; %bb.25: ; %frem.else78 +; GFX1150-NEXT: ; %bb.25: ; %frem.else ; GFX1150-NEXT: s_cmp_eq_f32 s9, s12 ; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15462,7 +15462,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_32 ; GFX1150-NEXT: .LBB12_26: ; GFX1150-NEXT: ; implicit-def: $vgpr3 -; GFX1150-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1150-NEXT: .LBB12_27: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |s2| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s7| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 @@ -15497,11 +15497,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 ; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v8, v5 @@ -15523,7 +15523,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.30: ; %Flow ; GFX1150-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-NEXT: v_mov_b32_e32 v5, v8 -; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7 ; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7 @@ -15597,7 +15597,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else78 ; GFX1200-NEXT: s_cmp_eq_f32 s5, s12 ; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15607,7 +15607,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_8 ; GFX1200-NEXT: .LBB12_2: ; GFX1200-NEXT: ; implicit-def: $vgpr0 -; GFX1200-NEXT: .LBB12_3: ; %frem.compute +; GFX1200-NEXT: .LBB12_3: ; %frem.compute77 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -15643,11 +15643,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: v_mov_b32_e32 v5, v2 @@ -15670,7 +15670,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.6: ; %Flow125 ; GFX1200-NEXT: v_mov_b32_e32 v4, s11 ; GFX1200-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1200-NEXT: 
.LBB12_7: ; %frem.loop_exit86 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 @@ -15694,7 +15694,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else47 ; GFX1200-NEXT: s_cmp_eq_f32 s8, s12 ; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15705,7 +15705,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_16 ; GFX1200-NEXT: .LBB12_10: ; GFX1200-NEXT: ; implicit-def: $vgpr1 -; GFX1200-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB12_11: ; %frem.compute46 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -15741,11 +15741,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v6, v3 @@ -15769,7 +15769,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow121 ; GFX1200-NEXT: v_mov_b32_e32 v5, s11 ; GFX1200-NEXT: v_mov_b32_e32 v3, v6 
-; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 @@ -15793,7 +15793,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18 -; GFX1200-NEXT: ; %bb.17: ; %frem.else47 +; GFX1200-NEXT: ; %bb.17: ; %frem.else16 ; GFX1200-NEXT: s_cmp_eq_f32 s10, s12 ; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15804,7 +15804,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_24 ; GFX1200-NEXT: .LBB12_18: ; GFX1200-NEXT: ; implicit-def: $vgpr2 -; GFX1200-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1200-NEXT: .LBB12_19: ; %frem.compute15 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 @@ -15840,11 +15840,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 ; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v7, v4 @@ -15868,7 +15868,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.22: ; %Flow117 ; 
GFX1200-NEXT: v_mov_b32_e32 v6, s11 ; GFX1200-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6 ; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6 @@ -15892,7 +15892,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26 -; GFX1200-NEXT: ; %bb.25: ; %frem.else78 +; GFX1200-NEXT: ; %bb.25: ; %frem.else ; GFX1200-NEXT: s_cmp_eq_f32 s9, s12 ; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15903,7 +15903,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_32 ; GFX1200-NEXT: .LBB12_26: ; GFX1200-NEXT: ; implicit-def: $vgpr3 -; GFX1200-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1200-NEXT: .LBB12_27: ; %frem.compute ; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 @@ -15939,11 +15939,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 ; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v8, v5 @@ -15967,7 +15967,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX1200-NEXT: ; %bb.30: ; %Flow ; GFX1200-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7 ; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7 @@ -16048,7 +16048,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| ; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz .LBB13_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else16 ; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; SI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16059,7 +16059,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB13_2: ; SI-NEXT: ; implicit-def: $vgpr8_vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB13_3: ; %frem.compute +; SI-NEXT: .LBB13_3: ; %frem.compute15 ; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v1 ; SI-NEXT: s_mov_b32 s0, 0 @@ -16105,13 +16105,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; SI-NEXT: s_cmp_lt_i32 s6, 27 ; SI-NEXT: s_cbranch_scc1 .LBB13_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s0, s3, s7 ; SI-NEXT: s_add_i32 s6, s0, 26 ; SI-NEXT: s_mov_b32 s3, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v18, 0x43300000 ; SI-NEXT: v_mov_b32_e32 v14, 0 -; SI-NEXT: .LBB13_5: ; %frem.loop_body +; SI-NEXT: .LBB13_5: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v17, v11 ; SI-NEXT: v_mov_b32_e32 v16, v10 @@ -16134,7 +16134,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; SI-NEXT: ; %bb.6: ; %Flow51 ; SI-NEXT: v_mov_b32_e32 v10, v16 ; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: .LBB13_7: ; %frem.loop_exit +; SI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; SI-NEXT: s_sub_i32 s0, s6, 25 ; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s0 ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] @@ -16160,7 +16160,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]| ; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz .LBB13_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; SI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16171,7 +16171,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB13_10: ; SI-NEXT: ; implicit-def: $vgpr10_vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB13_11: ; %frem.compute15 +; SI-NEXT: .LBB13_11: ; %frem.compute ; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v3 ; SI-NEXT: s_mov_b32 s0, 0 @@ -16217,13 +16217,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; SI-NEXT: s_cmp_lt_i32 s6, 27 ; SI-NEXT: s_cbranch_scc1 .LBB13_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s0, s3, s7 ; SI-NEXT: s_add_i32 s6, s0, 26 ; SI-NEXT: s_mov_b32 s3, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v20, 0x43300000 ; SI-NEXT: v_mov_b32_e32 v16, 0 -; SI-NEXT: .LBB13_13: ; %frem.loop_body23 +; SI-NEXT: .LBB13_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v19, v13 ; SI-NEXT: v_mov_b32_e32 v18, v12 @@ -16246,7 +16246,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: ; %bb.14: ; %Flow ; 
SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: v_mov_b32_e32 v13, v19 -; SI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB13_15: ; %frem.loop_exit ; SI-NEXT: s_sub_i32 s0, s6, 25 ; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], s0 ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] @@ -16304,7 +16304,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB13_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; CI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16313,7 +16313,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB13_8 ; CI-NEXT: .LBB13_2: ; CI-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CI-NEXT: .LBB13_3: ; %frem.compute +; CI-NEXT: .LBB13_3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; CI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; CI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16337,10 +16337,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; CI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v14, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 v17, vcc, 26, v14 -; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: .LBB13_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v11 ; CI-NEXT: v_mov_b32_e32 v14, v10 @@ -16358,7 +16358,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; %bb.6: ; %Flow51 ; CI-NEXT: v_mov_b32_e32 v10, v14 ; CI-NEXT: v_mov_b32_e32 v11, v15 -; CI-NEXT: .LBB13_7: ; 
%frem.loop_exit +; CI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; CI-NEXT: v_subrev_i32_e32 v14, vcc, 25, v17 ; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; CI-NEXT: s_brev_b32 s2, -2 @@ -16375,7 +16375,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB13_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; CI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; CI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16384,7 +16384,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB13_16 ; CI-NEXT: .LBB13_10: ; CI-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CI-NEXT: .LBB13_11: ; %frem.compute15 +; CI-NEXT: .LBB13_11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; CI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; CI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] @@ -16408,10 +16408,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; CI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v19, vcc, 26, v16 -; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: .LBB13_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v17, v13 ; CI-NEXT: v_mov_b32_e32 v16, v12 @@ -16429,7 +16429,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v12, v16 ; CI-NEXT: v_mov_b32_e32 v13, v17 -; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit ; CI-NEXT: 
v_subrev_i32_e32 v16, vcc, 25, v19 ; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; CI-NEXT: s_brev_b32 s2, -2 @@ -16478,7 +16478,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; VI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; VI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16487,7 +16487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB13_8 ; VI-NEXT: .LBB13_2: ; VI-NEXT: ; implicit-def: $vgpr8_vgpr9 -; VI-NEXT: .LBB13_3: ; %frem.compute +; VI-NEXT: .LBB13_3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; VI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; VI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16511,10 +16511,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; VI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v14, vcc, v14, v15 ; VI-NEXT: v_add_u32_e32 v17, vcc, 26, v14 -; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: .LBB13_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v15, v11 ; VI-NEXT: v_mov_b32_e32 v14, v10 @@ -16532,7 +16532,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; %bb.6: ; %Flow51 ; VI-NEXT: v_mov_b32_e32 v10, v14 ; VI-NEXT: v_mov_b32_e32 v11, v15 -; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; VI-NEXT: v_subrev_u32_e32 v14, vcc, 25, v17 ; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; VI-NEXT: s_brev_b32 
s2, -2 @@ -16549,7 +16549,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB13_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; VI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; VI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16558,7 +16558,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB13_16 ; VI-NEXT: .LBB13_10: ; VI-NEXT: ; implicit-def: $vgpr10_vgpr11 -; VI-NEXT: .LBB13_11: ; %frem.compute15 +; VI-NEXT: .LBB13_11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; VI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; VI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] @@ -16582,10 +16582,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; VI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v19, vcc, 26, v16 -; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: .LBB13_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v17, v13 ; VI-NEXT: v_mov_b32_e32 v16, v12 @@ -16603,7 +16603,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v12, v16 ; VI-NEXT: v_mov_b32_e32 v13, v17 -; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit ; VI-NEXT: v_subrev_u32_e32 v16, vcc, 25, v19 ; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; VI-NEXT: s_brev_b32 s2, -2 @@ -16647,7 +16647,7 @@ define amdgpu_kernel void @frem_v2f64(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else16 ; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16656,7 +16656,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB13_8 ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX9-NEXT: .LBB13_3: ; %frem.compute +; GFX9-NEXT: .LBB13_3: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16680,10 +16680,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; GFX9-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v17, 26, v14 -; GFX9-NEXT: .LBB13_5: ; %frem.loop_body +; GFX9-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-NEXT: v_mov_b32_e32 v14, v10 @@ -16701,7 +16701,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: ; %bb.6: ; %Flow51 ; GFX9-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX9-NEXT: v_subrev_u32_e32 v14, 25, v17 ; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; GFX9-NEXT: s_brev_b32 s2, -2 @@ -16718,7 +16718,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB13_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16727,7 +16727,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB13_16 ; GFX9-NEXT: .LBB13_10: ; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX9-NEXT: .LBB13_11: ; %frem.compute15 +; GFX9-NEXT: .LBB13_11: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] @@ -16751,10 +16751,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; GFX9-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v16, v16, v17 ; GFX9-NEXT: v_add_u32_e32 v19, 26, v16 -; GFX9-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB13_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_mov_b32_e32 v16, v12 @@ -16772,7 +16772,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v12, v16 ; GFX9-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX9-NEXT: v_subrev_u32_e32 v16, 25, v19 ; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; GFX9-NEXT: s_brev_b32 s2, -2 @@ -16817,7 +16817,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: 
v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB13_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else16 ; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX10-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo @@ -16826,7 +16826,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB13_8 ; GFX10-NEXT: .LBB13_2: ; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX10-NEXT: .LBB13_3: ; %frem.compute +; GFX10-NEXT: .LBB13_3: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -16851,10 +16851,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX10-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 26 -; GFX10-NEXT: .LBB13_5: ; %frem.loop_body +; GFX10-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; GFX10-NEXT: v_mov_b32_e32 v14, v10 @@ -16873,7 +16873,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v17, s2 ; GFX10-NEXT: v_mov_b32_e32 v11, v15 -; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] @@ -16889,7 +16889,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB13_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX10-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo @@ -16898,7 +16898,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB13_16 ; GFX10-NEXT: .LBB13_10: ; GFX10-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX10-NEXT: .LBB13_11: ; %frem.compute15 +; GFX10-NEXT: .LBB13_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -16923,10 +16923,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX10-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 26 -; GFX10-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB13_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v17, v13 ; GFX10-NEXT: v_mov_b32_e32 v16, v12 @@ -16945,7 +16945,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v19, s2 ; GFX10-NEXT: v_mov_b32_e32 v13, v17 -; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] @@ -16986,7 +16986,7 @@ define amdgpu_kernel void @frem_v2f64(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else16 ; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX11-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -16996,7 +16996,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB13_8 ; GFX11-NEXT: .LBB13_2: ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX11-NEXT: .LBB13_3: ; %frem.compute +; GFX11-NEXT: .LBB13_3: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17029,12 +17029,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 26 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_5: ; %frem.loop_body +; GFX11-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17054,7 +17054,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow51 ; GFX11-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX11-NEXT: v_mov_b32_e32 v11, v15 -; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX11-NEXT: .LBB13_7: ; 
%frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17074,7 +17074,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB13_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else ; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX11-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17084,7 +17084,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB13_16 ; GFX11-NEXT: .LBB13_10: ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX11-NEXT: .LBB13_11: ; %frem.compute15 +; GFX11-NEXT: .LBB13_11: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17117,12 +17117,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 26 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB13_13: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17142,7 +17142,7 @@ define 
amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow ; GFX11-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX11-NEXT: v_mov_b32_e32 v13, v17 -; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 @@ -17187,7 +17187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1150-NEXT: s_cbranch_vccz .LBB13_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 ; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX1150-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17197,7 +17197,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB13_8 ; GFX1150-NEXT: .LBB13_2: ; GFX1150-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1150-NEXT: .LBB13_3: ; %frem.compute +; GFX1150-NEXT: .LBB13_3: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17229,12 +17229,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX1150-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1150-NEXT: s_sub_i32 s2, s2, s3 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s2, s2, 26 ; GFX1150-NEXT: .p2align 6 -; GFX1150-NEXT: 
.LBB13_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17254,7 +17254,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow51 ; GFX1150-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX1150-NEXT: v_mov_b32_e32 v11, v15 -; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17274,7 +17274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1150-NEXT: s_cbranch_vccz .LBB13_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else ; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX1150-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17284,7 +17284,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB13_16 ; GFX1150-NEXT: .LBB13_10: ; GFX1150-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1150-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB13_11: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17316,12 +17316,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX1150-NEXT: v_div_fixup_f64 v[14:15], 
v[14:15], v[10:11], 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s2, s2, s3 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s2, s2, 26 ; GFX1150-NEXT: .p2align 6 -; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17341,7 +17341,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow ; GFX1150-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX1150-NEXT: v_mov_b32_e32 v13, v17 -; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 @@ -17386,7 +17386,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1200-NEXT: s_cbranch_vccz .LBB13_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 ; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX1200-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17396,7 +17396,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB13_8 ; GFX1200-NEXT: .LBB13_2: ; GFX1200-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1200-NEXT: .LBB13_3: ; %frem.compute +; GFX1200-NEXT: .LBB13_3: ; %frem.compute15 ; GFX1200-NEXT: 
v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17429,11 +17429,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX1200-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 -; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body +; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17454,7 +17454,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.6: ; %Flow51 ; GFX1200-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX1200-NEXT: v_mov_b32_e32 v11, v15 -; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17476,7 +17476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cbranch_vccz .LBB13_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else ; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX1200-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX1200-NEXT: s_wait_alu 0xfffd @@ -17487,7 +17487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB13_16 ; GFX1200-NEXT: .LBB13_10: ; GFX1200-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1200-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB13_11: ; %frem.compute ; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17520,11 +17520,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX1200-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 -; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17547,7 +17547,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow ; GFX1200-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX1200-NEXT: v_mov_b32_e32 v13, v17 -; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3c41cc4..5babe9f 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1111,15 +1111,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 
{ ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1186,15 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1281,28 +1274,22 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v8i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.l, v1.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6 -; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[2:3], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v8i8: @@ -1416,44 +1403,34 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v15.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v10.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v6.l, v7.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v9.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14 -; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v16i8: @@ -1649,78 +1626,59 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, 
v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, 
v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v6.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v11.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v12.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v16.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v9.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v8.h ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index f67ab18..234eaa8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4985,21 +4985,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: 
s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5243,18 +5239,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off -; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off +; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off +; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 @@ -5528,27 +5520,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 
v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -5994,73 +5980,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 
8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v4.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v0.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v5.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off -; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off +; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[9:12], off ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index 866d3a1..b04602a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -12,21 +12,21 @@ ; Non-R600 OSes use relocations. 
; GCN-DEFAULT: s_getpc_b64 s[[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]] -; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], private1@rel32@lo+4 -; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], private1@rel32@hi+12 +; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], .Lprivate1@rel32@lo+4 +; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], .Lprivate1@rel32@hi+12 ; GCN-DEFAULT: s_getpc_b64 s[[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]] -; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4 -; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+12 +; GCN-DEFAULT: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], .Lprivate2@rel32@lo+4 +; GCN-DEFAULT: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], .Lprivate2@rel32@hi+12 ; MESA uses absolute relocations. -; GCN-MESA: s_add_u32 s2, private1@abs32@lo, s4 -; GCN-MESA: s_addc_u32 s3, private1@abs32@hi, s5 +; GCN-MESA: s_add_u32 s2, .Lprivate1@abs32@lo, s4 +; GCN-MESA: s_addc_u32 s3, .Lprivate1@abs32@hi, s5 ; PAL uses absolute relocations. -; GCN-PAL: s_add_u32 s2, private1@abs32@lo, s4 -; GCN-PAL: s_addc_u32 s3, private1@abs32@hi, s5 -; GCN-PAL: s_add_u32 s4, private2@abs32@lo, s4 -; GCN-PAL: s_addc_u32 s5, private2@abs32@hi, s5 +; GCN-PAL: s_add_u32 s2, .Lprivate1@abs32@lo, s4 +; GCN-PAL: s_addc_u32 s3, .Lprivate1@abs32@hi, s5 +; GCN-PAL: s_add_u32 s4, .Lprivate2@abs32@lo, s4 +; GCN-PAL: s_addc_u32 s5, .Lprivate2@abs32@hi, s5 ; R600-LABEL: private_test define amdgpu_kernel void @private_test(i32 %index, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll index b8cfcbf..6d55e79 100644 --- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -14,8 +14,8 @@ ; CHECK-LABEL: private_test: ; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8 -; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], 
private@rel32@hi+16 +; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], .Lprivate@rel32@lo+8 +; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], .Lprivate@rel32@hi+16 ; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @private_test(ptr addrspace(1) %out) { %ptr = getelementptr [256 x i32], ptr addrspace(1) @private, i32 0, i32 1 @@ -153,7 +153,7 @@ define amdgpu_kernel void @external_w_init_test(ptr addrspace(1) %out) { ret void } -; CHECK: .local private +; CHECK: .local .Lprivate ; CHECK: .local internal ; CHECK: .weak linkonce ; CHECK: .weak weak diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 049663a..f80d50b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2730,18 +2730,15 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll index 63e9eef..66b7958 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -315,7 +315,7 @@ define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(ptr ad ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4: ; SI: s_getpc_b64 -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, hello.align4@rel32@lo+4 +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, .Lhello.align4@rel32@lo+4 ; SI: s_addc_u32 ; SI-DAG: s_load_dwordx8 ; SI-DAG: s_load_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll index 5ff2d82..2509497 100644 --- a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll @@ -5,8 +5,8 @@ declare dso_local void @main() define dso_local void @naked() naked "frame-pointer"="all" { ; CHECK-LABEL: naked: -; CHECK: naked$local: -; CHECK-NEXT: .type naked$local,@function +; CHECK: .Lnaked$local: +; CHECK-NEXT: .type .Lnaked$local,@function ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -19,8 +19,8 @@ define dso_local void @naked() naked "frame-pointer"="all" { define dso_local void @normal() "frame-pointer"="all" { ; CHECK-LABEL: normal: -; CHECK: normal$local: -; CHECK-NEXT: .type normal$local,@function +; CHECK: .Lnormal$local: +; CHECK-NEXT: .type .Lnormal$local,@function ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll index d111cf2..50f7d40 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll @@ -284,7 +284,6 @@ define i32 @caller_half_in_fregs() nounwind { ; LA64S-NEXT: addi.d $sp, $sp, -16 ; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64S-NEXT: lu12i.w $a0, -12 -; LA64S-NEXT: lu32i.d $a0, 0 ; LA64S-NEXT: movgr2fr.w $fa0, $a0 ; LA64S-NEXT: ori $a0, $zero, 1 ; LA64S-NEXT: ori $a1, $zero, 2 @@ -326,7 +325,6 @@ define i32 @caller_half_in_fregs() nounwind { ; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 ; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64F-LP64D-NEXT: lu12i.w $a0, -12 -; LA64F-LP64D-NEXT: lu32i.d $a0, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 ; LA64F-LP64D-NEXT: ori $a0, $zero, 1 ; LA64F-LP64D-NEXT: ori $a1, $zero, 2 @@ -368,7 +366,6 @@ define i32 @caller_half_in_fregs() nounwind { ; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 ; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64D-LP64D-NEXT: lu12i.w $a0, -12 -; LA64D-LP64D-NEXT: lu32i.d $a0, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 ; LA64D-LP64D-NEXT: ori $a0, $zero, 1 ; LA64D-LP64D-NEXT: ori $a1, $zero, 2 @@ -688,32 +685,23 @@ define i32 @caller_half_in_gregs() nounwind { ; LA64S-NEXT: addi.d $sp, $sp, -16 ; LA64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64S-NEXT: lu12i.w $a1, -12 +; LA64S-NEXT: movgr2fr.w $fa1, $a1 ; LA64S-NEXT: ori $a0, $a1, 2176 +; LA64S-NEXT: lu12i.w $a2, -13 +; LA64S-NEXT: ori $a2, $a2, 3072 +; LA64S-NEXT: movgr2fr.w $fa0, $a2 ; LA64S-NEXT: ori $a2, $a1, 512 -; LA64S-NEXT: ori $a3, $a1, 1024 -; LA64S-NEXT: ori $a4, $a1, 1280 -; LA64S-NEXT: ori $a5, $a1, 1536 -; LA64S-NEXT: ori $a6, $a1, 1792 -; LA64S-NEXT: ori $a7, $a1, 2048 -; LA64S-NEXT: lu32i.d $a1, 0 -; LA64S-NEXT: movgr2fr.w $fa1, $a1 -; LA64S-NEXT: lu12i.w $a1, -13 -; 
LA64S-NEXT: ori $a1, $a1, 3072 -; LA64S-NEXT: lu32i.d $a1, 0 -; LA64S-NEXT: movgr2fr.w $fa0, $a1 -; LA64S-NEXT: lu32i.d $a2, 0 ; LA64S-NEXT: movgr2fr.w $fa2, $a2 -; LA64S-NEXT: lu32i.d $a3, 0 -; LA64S-NEXT: movgr2fr.w $fa3, $a3 -; LA64S-NEXT: lu32i.d $a4, 0 -; LA64S-NEXT: movgr2fr.w $fa4, $a4 -; LA64S-NEXT: lu32i.d $a5, 0 -; LA64S-NEXT: movgr2fr.w $fa5, $a5 -; LA64S-NEXT: lu32i.d $a0, 0 -; LA64S-NEXT: lu32i.d $a6, 0 -; LA64S-NEXT: movgr2fr.w $fa6, $a6 -; LA64S-NEXT: lu32i.d $a7, 0 -; LA64S-NEXT: movgr2fr.w $fa7, $a7 +; LA64S-NEXT: ori $a2, $a1, 1024 +; LA64S-NEXT: movgr2fr.w $fa3, $a2 +; LA64S-NEXT: ori $a2, $a1, 1280 +; LA64S-NEXT: movgr2fr.w $fa4, $a2 +; LA64S-NEXT: ori $a2, $a1, 1536 +; LA64S-NEXT: movgr2fr.w $fa5, $a2 +; LA64S-NEXT: ori $a2, $a1, 1792 +; LA64S-NEXT: movgr2fr.w $fa6, $a2 +; LA64S-NEXT: ori $a1, $a1, 2048 +; LA64S-NEXT: movgr2fr.w $fa7, $a1 ; LA64S-NEXT: ori $a1, $zero, 10 ; LA64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs) ; LA64S-NEXT: jirl $ra, $ra, 0 @@ -730,22 +718,14 @@ define i32 @caller_half_in_gregs() nounwind { ; LA64F-LP64S-NEXT: lu12i.w $a1, -12 ; LA64F-LP64S-NEXT: ori $t0, $a1, 2176 ; LA64F-LP64S-NEXT: lu32i.d $t0, 0 +; LA64F-LP64S-NEXT: lu12i.w $a0, -13 +; LA64F-LP64S-NEXT: ori $a0, $a0, 3072 ; LA64F-LP64S-NEXT: ori $a2, $a1, 512 ; LA64F-LP64S-NEXT: ori $a3, $a1, 1024 ; LA64F-LP64S-NEXT: ori $a4, $a1, 1280 ; LA64F-LP64S-NEXT: ori $a5, $a1, 1536 ; LA64F-LP64S-NEXT: ori $a6, $a1, 1792 ; LA64F-LP64S-NEXT: ori $a7, $a1, 2048 -; LA64F-LP64S-NEXT: lu32i.d $a1, 0 -; LA64F-LP64S-NEXT: lu12i.w $a0, -13 -; LA64F-LP64S-NEXT: ori $a0, $a0, 3072 -; LA64F-LP64S-NEXT: lu32i.d $a0, 0 -; LA64F-LP64S-NEXT: lu32i.d $a2, 0 -; LA64F-LP64S-NEXT: lu32i.d $a3, 0 -; LA64F-LP64S-NEXT: lu32i.d $a4, 0 -; LA64F-LP64S-NEXT: lu32i.d $a5, 0 -; LA64F-LP64S-NEXT: lu32i.d $a6, 0 -; LA64F-LP64S-NEXT: lu32i.d $a7, 0 ; LA64F-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs) ; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 @@ 
-758,32 +738,23 @@ define i32 @caller_half_in_gregs() nounwind { ; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 ; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64F-LP64D-NEXT: lu12i.w $a1, -12 +; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-LP64D-NEXT: ori $a0, $a1, 2176 +; LA64F-LP64D-NEXT: lu12i.w $a2, -13 +; LA64F-LP64D-NEXT: ori $a2, $a2, 3072 +; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a2 ; LA64F-LP64D-NEXT: ori $a2, $a1, 512 -; LA64F-LP64D-NEXT: ori $a3, $a1, 1024 -; LA64F-LP64D-NEXT: ori $a4, $a1, 1280 -; LA64F-LP64D-NEXT: ori $a5, $a1, 1536 -; LA64F-LP64D-NEXT: ori $a6, $a1, 1792 -; LA64F-LP64D-NEXT: ori $a7, $a1, 2048 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64F-LP64D-NEXT: lu12i.w $a1, -13 -; LA64F-LP64D-NEXT: ori $a1, $a1, 3072 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1 -; LA64F-LP64D-NEXT: lu32i.d $a2, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa2, $a2 -; LA64F-LP64D-NEXT: lu32i.d $a3, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a3 -; LA64F-LP64D-NEXT: lu32i.d $a4, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a4 -; LA64F-LP64D-NEXT: lu32i.d $a5, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a5 -; LA64F-LP64D-NEXT: lu32i.d $a0, 0 -; LA64F-LP64D-NEXT: lu32i.d $a6, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a6 -; LA64F-LP64D-NEXT: lu32i.d $a7, 0 -; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a7 +; LA64F-LP64D-NEXT: ori $a2, $a1, 1024 +; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a2 +; LA64F-LP64D-NEXT: ori $a2, $a1, 1280 +; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a2 +; LA64F-LP64D-NEXT: ori $a2, $a1, 1536 +; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a2 +; LA64F-LP64D-NEXT: ori $a2, $a1, 1792 +; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a2 +; LA64F-LP64D-NEXT: ori $a1, $a1, 2048 +; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a1 ; LA64F-LP64D-NEXT: ori $a1, $zero, 10 ; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs) ; LA64F-LP64D-NEXT: jirl $ra, $ra, 0 @@ -800,22 +771,14 @@ define i32 @caller_half_in_gregs() nounwind { ; 
LA64D-LP64S-NEXT: lu12i.w $a1, -12 ; LA64D-LP64S-NEXT: ori $t0, $a1, 2176 ; LA64D-LP64S-NEXT: lu32i.d $t0, 0 +; LA64D-LP64S-NEXT: lu12i.w $a0, -13 +; LA64D-LP64S-NEXT: ori $a0, $a0, 3072 ; LA64D-LP64S-NEXT: ori $a2, $a1, 512 ; LA64D-LP64S-NEXT: ori $a3, $a1, 1024 ; LA64D-LP64S-NEXT: ori $a4, $a1, 1280 ; LA64D-LP64S-NEXT: ori $a5, $a1, 1536 ; LA64D-LP64S-NEXT: ori $a6, $a1, 1792 ; LA64D-LP64S-NEXT: ori $a7, $a1, 2048 -; LA64D-LP64S-NEXT: lu32i.d $a1, 0 -; LA64D-LP64S-NEXT: lu12i.w $a0, -13 -; LA64D-LP64S-NEXT: ori $a0, $a0, 3072 -; LA64D-LP64S-NEXT: lu32i.d $a0, 0 -; LA64D-LP64S-NEXT: lu32i.d $a2, 0 -; LA64D-LP64S-NEXT: lu32i.d $a3, 0 -; LA64D-LP64S-NEXT: lu32i.d $a4, 0 -; LA64D-LP64S-NEXT: lu32i.d $a5, 0 -; LA64D-LP64S-NEXT: lu32i.d $a6, 0 -; LA64D-LP64S-NEXT: lu32i.d $a7, 0 ; LA64D-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs) ; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 @@ -828,32 +791,23 @@ define i32 @caller_half_in_gregs() nounwind { ; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 ; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64D-LP64D-NEXT: lu12i.w $a1, -12 +; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64D-LP64D-NEXT: ori $a0, $a1, 2176 +; LA64D-LP64D-NEXT: lu12i.w $a2, -13 +; LA64D-LP64D-NEXT: ori $a2, $a2, 3072 +; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a2 ; LA64D-LP64D-NEXT: ori $a2, $a1, 512 -; LA64D-LP64D-NEXT: ori $a3, $a1, 1024 -; LA64D-LP64D-NEXT: ori $a4, $a1, 1280 -; LA64D-LP64D-NEXT: ori $a5, $a1, 1536 -; LA64D-LP64D-NEXT: ori $a6, $a1, 1792 -; LA64D-LP64D-NEXT: ori $a7, $a1, 2048 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1 -; LA64D-LP64D-NEXT: lu12i.w $a1, -13 -; LA64D-LP64D-NEXT: ori $a1, $a1, 3072 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1 -; LA64D-LP64D-NEXT: lu32i.d $a2, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa2, $a2 -; LA64D-LP64D-NEXT: lu32i.d $a3, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a3 -; LA64D-LP64D-NEXT: lu32i.d $a4, 0 -; 
LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a4 -; LA64D-LP64D-NEXT: lu32i.d $a5, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a5 -; LA64D-LP64D-NEXT: lu32i.d $a0, 0 -; LA64D-LP64D-NEXT: lu32i.d $a6, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a6 -; LA64D-LP64D-NEXT: lu32i.d $a7, 0 -; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a7 +; LA64D-LP64D-NEXT: ori $a2, $a1, 1024 +; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a2 +; LA64D-LP64D-NEXT: ori $a2, $a1, 1280 +; LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a2 +; LA64D-LP64D-NEXT: ori $a2, $a1, 1536 +; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a2 +; LA64D-LP64D-NEXT: ori $a2, $a1, 1792 +; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a2 +; LA64D-LP64D-NEXT: ori $a1, $a1, 2048 +; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a1 ; LA64D-LP64D-NEXT: ori $a1, $zero, 10 ; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(callee_half_in_gregs) ; LA64D-LP64D-NEXT: jirl $ra, $ra, 0 @@ -1231,28 +1185,20 @@ define i32 @caller_half_on_stack() nounwind { ; LA64S-NEXT: ori $t0, $a0, 3200 ; LA64S-NEXT: lu32i.d $t0, 0 ; LA64S-NEXT: ori $a1, $a0, 2304 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa0, $a1 ; LA64S-NEXT: ori $a1, $a0, 2432 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa1, $a1 ; LA64S-NEXT: ori $a1, $a0, 2560 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa2, $a1 ; LA64S-NEXT: ori $a1, $a0, 2688 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa3, $a1 ; LA64S-NEXT: ori $a1, $a0, 2816 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa4, $a1 ; LA64S-NEXT: ori $a1, $a0, 2944 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa5, $a1 ; LA64S-NEXT: ori $a1, $a0, 3072 -; LA64S-NEXT: lu32i.d $a1, 0 ; LA64S-NEXT: movgr2fr.w $fa6, $a1 ; LA64S-NEXT: ori $a0, $a0, 3136 -; LA64S-NEXT: lu32i.d $a0, 0 ; LA64S-NEXT: movgr2fr.w $fa7, $a0 ; LA64S-NEXT: ori $a0, $zero, 1 ; LA64S-NEXT: ori $a1, $zero, 2 @@ -1323,28 +1269,20 @@ define i32 @caller_half_on_stack() nounwind { ; LA64F-LP64D-NEXT: ori $t0, $a0, 3200 ; LA64F-LP64D-NEXT: lu32i.d $t0, 0 ; 
LA64F-LP64D-NEXT: ori $a1, $a0, 2304 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 2432 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 2560 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa2, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 2688 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa3, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 2816 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa4, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 2944 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa5, $a1 ; LA64F-LP64D-NEXT: ori $a1, $a0, 3072 -; LA64F-LP64D-NEXT: lu32i.d $a1, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa6, $a1 ; LA64F-LP64D-NEXT: ori $a0, $a0, 3136 -; LA64F-LP64D-NEXT: lu32i.d $a0, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa7, $a0 ; LA64F-LP64D-NEXT: ori $a0, $zero, 1 ; LA64F-LP64D-NEXT: ori $a1, $zero, 2 @@ -1415,28 +1353,20 @@ define i32 @caller_half_on_stack() nounwind { ; LA64D-LP64D-NEXT: ori $t0, $a0, 3200 ; LA64D-LP64D-NEXT: lu32i.d $t0, 0 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2304 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2432 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2560 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa2, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2688 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa3, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2816 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa4, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 2944 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa5, $a1 ; LA64D-LP64D-NEXT: ori $a1, $a0, 3072 -; LA64D-LP64D-NEXT: lu32i.d $a1, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa6, $a1 ; LA64D-LP64D-NEXT: ori $a0, $a0, 3136 -; 
LA64D-LP64D-NEXT: lu32i.d $a0, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa7, $a0 ; LA64D-LP64D-NEXT: ori $a0, $zero, 1 ; LA64D-LP64D-NEXT: ori $a1, $zero, 2 @@ -1493,7 +1423,6 @@ define half @callee_half_ret() nounwind { ; LA64S: # %bb.0: ; LA64S-NEXT: lu12i.w $a0, -13 ; LA64S-NEXT: ori $a0, $a0, 3072 -; LA64S-NEXT: lu32i.d $a0, 0 ; LA64S-NEXT: movgr2fr.w $fa0, $a0 ; LA64S-NEXT: ret ; @@ -1501,14 +1430,12 @@ define half @callee_half_ret() nounwind { ; LA64F-LP64S: # %bb.0: ; LA64F-LP64S-NEXT: lu12i.w $a0, -13 ; LA64F-LP64S-NEXT: ori $a0, $a0, 3072 -; LA64F-LP64S-NEXT: lu32i.d $a0, 0 ; LA64F-LP64S-NEXT: ret ; ; LA64F-LP64D-LABEL: callee_half_ret: ; LA64F-LP64D: # %bb.0: ; LA64F-LP64D-NEXT: lu12i.w $a0, -13 ; LA64F-LP64D-NEXT: ori $a0, $a0, 3072 -; LA64F-LP64D-NEXT: lu32i.d $a0, 0 ; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 ; LA64F-LP64D-NEXT: ret ; @@ -1516,14 +1443,12 @@ define half @callee_half_ret() nounwind { ; LA64D-LP64S: # %bb.0: ; LA64D-LP64S-NEXT: lu12i.w $a0, -13 ; LA64D-LP64S-NEXT: ori $a0, $a0, 3072 -; LA64D-LP64S-NEXT: lu32i.d $a0, 0 ; LA64D-LP64S-NEXT: ret ; ; LA64D-LP64D-LABEL: callee_half_ret: ; LA64D-LP64D: # %bb.0: ; LA64D-LP64D-NEXT: lu12i.w $a0, -13 ; LA64D-LP64D-NEXT: ori $a0, $a0, 3072 -; LA64D-LP64D-NEXT: lu32i.d $a0, 0 ; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 ; LA64D-LP64D-NEXT: ret ret half 1.0 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index a6e3f79..0d0fb21 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -76,7 +76,6 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: lu12i.w $a1, -264192 -; LA64F-NEXT: lu32i.d $a1, 0 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB1_1: # %atomicrmw.start @@ -641,7 +640,6 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F: # 
%bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: lu12i.w $a1, -264192 -; LA64F-NEXT: lu32i.d $a1, 0 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB9_1: # %atomicrmw.start @@ -1206,7 +1204,6 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: lu12i.w $a1, -264192 -; LA64F-NEXT: lu32i.d $a1, 0 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB17_1: # %atomicrmw.start @@ -1771,7 +1768,6 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: lu12i.w $a1, -264192 -; LA64F-NEXT: lu32i.d $a1, 0 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB25_1: # %atomicrmw.start @@ -2336,7 +2332,6 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: lu12i.w $a1, -264192 -; LA64F-NEXT: lu32i.d $a1, 0 ; LA64F-NEXT: movgr2fr.w $fa1, $a1 ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB33_1: # %atomicrmw.start diff --git a/llvm/test/CodeGen/PowerPC/aix-alloca-r31.ll b/llvm/test/CodeGen/PowerPC/aix-alloca-r31.ll index edfa0b9..2ee6e08 100644 --- a/llvm/test/CodeGen/PowerPC/aix-alloca-r31.ll +++ b/llvm/test/CodeGen/PowerPC/aix-alloca-r31.ll @@ -31,7 +31,7 @@ define i32 @varalloca() local_unnamed_addr { ; CHECK-ASM32-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-ASM32-NEXT: .byte 0x00 # Version = 0 ; CHECK-ASM32-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-ASM32-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; CHECK-ASM32-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-ASM32-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-ASM32-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-ASM32-NEXT: # -IsFloatingPointPresent @@ -70,7 +70,7 @@ define i32 @varalloca() local_unnamed_addr { ; CHECK-ASM64-NEXT: .vbyte 4, 
0x00000000 # Traceback table begin ; CHECK-ASM64-NEXT: .byte 0x00 # Version = 0 ; CHECK-ASM64-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-ASM64-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; CHECK-ASM64-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-ASM64-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-ASM64-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-ASM64-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-clobber-register.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-clobber-register.ll index 42bd478..8e4e0d3 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-clobber-register.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-clobber-register.ll @@ -49,7 +49,7 @@ entry: ; COMMON: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent @@ -70,7 +70,7 @@ entry: ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-redzone-boundary.mir b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-redzone-boundary.mir index 3d4b5a7..7041315 100644 --- 
a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-redzone-boundary.mir +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-redzone-boundary.mir @@ -25,7 +25,7 @@ body: | ; CHECK: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus - ; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue + ; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent @@ -43,7 +43,7 @@ body: | ; CHECK: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus - ; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue + ; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo.ll index 83e413a..f03a6c0 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo.ll @@ -82,7 +82,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1 ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent @@ -107,7 +107,7 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x 
float>) #1 ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo_hasvarg.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo_hasvarg.ll index 8c0a589..26506f8 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo_hasvarg.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable-vectorinfo_hasvarg.ll @@ -15,7 +15,7 @@ entry: ;CHECK-ASM: .vbyte 4, 0x00000000 # Traceback table begin ;CHECK-ASM-NEXT: .byte 0x00 # Version = 0 ;CHECK-ASM-NEXT: .byte 0x09 # Language = CPlusPlus -;CHECK-ASM-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +;CHECK-ASM-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ;CHECK-ASM-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ;CHECK-ASM-NEXT: # -HasControlledStorage, -IsTOCless ;CHECK-ASM-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll index ce97f37..2827155 100644 --- a/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll +++ b/llvm/test/CodeGen/PowerPC/aix-emit-tracebacktable.ll @@ -138,7 +138,7 @@ entry: ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # 
-HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent @@ -167,7 +167,7 @@ entry: ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent @@ -190,7 +190,7 @@ entry: ; COMMON: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x22 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x22 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # +IsFloatingPointPresent @@ -217,7 +217,7 @@ entry: ; COMMON-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; COMMON-NEXT: .byte 0x00 # Version = 0 ; COMMON-NEXT: .byte 0x09 # Language = CPlusPlus -; COMMON-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; COMMON-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; COMMON-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; COMMON-NEXT: # -HasControlledStorage, -IsTOCless ; COMMON-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/PowerPC/aix-exception.ll b/llvm/test/CodeGen/PowerPC/aix-exception.ll index 5035d8e..5b364ef 100644 --- a/llvm/test/CodeGen/PowerPC/aix-exception.ll +++ b/llvm/test/CodeGen/PowerPC/aix-exception.ll @@ -113,7 +113,7 @@ eh.resume: ; preds = %catch.dispatch ; ASM: .vbyte 4, 0x00000000 # Traceback table begin ; ASM: .byte 0x00 # Version = 0 ; ASM: .byte 0x09 # Language = CPlusPlus -; ASM: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; 
ASM: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; ASM: # +HasTraceBackTableOffset, -IsInternalProcedure ; ASM: # -HasControlledStorage, -IsTOCless ; ASM: # -IsFloatingPointPresent diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll index 7bde1b7..7cdfd51 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -7,12 +7,15 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: addq %rdi, %rax +; X64-NEXT: setb %dl +; X64-NEXT: cmpb $1, %dl ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq ; ; X86-LABEL: test_add_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -21,8 +24,14 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %edx, 4(%eax) @@ -30,6 +39,7 @@ define i128 @test_add_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %ret = add i128 %arg1, %arg2 ret i128 %ret @@ -46,6 +56,8 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setb %cl +; X86-NEXT: cmpb $1, %cl ; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl %ret = add i64 %arg1, %arg2 diff --git 
a/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir index ec9db78..dae2ad6 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-add.mir @@ -157,8 +157,8 @@ body: | ; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) ; X86-NEXT: RET 0 @@ -192,8 +192,8 @@ body: | ; X86-NEXT: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64) - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) ; X86-NEXT: RET 0 @@ -219,8 +219,8 @@ body: | ; X64-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF ; X64-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](s128) ; X64-NEXT: 
[[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF1]](s128) - ; X64-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]] - ; X64-NEXT: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] + ; X64-NEXT: [[UADDO:%[0-9]+]]:_(s64), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV2]] + ; X64-NEXT: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]] ; X64-NEXT: $rax = COPY [[UADDO]](s64) ; X64-NEXT: $rdx = COPY [[UADDE]](s64) ; X64-NEXT: RET 0 @@ -230,10 +230,10 @@ body: | ; X86-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s128) ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s128) - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV4]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV5]], [[UADDO1]] - ; X86-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV2]], [[UV6]], [[UADDE1]] - ; X86-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UV3]], [[UV7]], [[UADDE3]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[UV]], [[UV4]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[UV1]], [[UV5]], [[UADDO1]] + ; X86-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s8) = G_UADDE [[UV2]], [[UV6]], [[UADDE1]] + ; X86-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s8) = G_UADDE [[UV3]], [[UV7]], [[UADDE3]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) ; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE2]](s32), [[UADDE4]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) diff --git 
a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir index 19fe5b8..470a30fd 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir @@ -25,6 +25,7 @@ body: | ; X64-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[CTLZ]], [[C1]] ; X64-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C]] ; X64-NEXT: RET 0, implicit [[AND1]](s64) + ; ; X86-LABEL: name: test_ctlz35 ; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx ; X86-NEXT: [[TRUNC:%[0-9]+]]:_(s35) = G_TRUNC [[COPY]](s64) @@ -46,12 +47,15 @@ body: | ; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C]](s32) ; X86-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) ; X86-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV1]](s64) - ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[UV8]] - ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[UV9]], [[USUBO1]] + ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV6]], [[UV8]] + ; X86-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[USUBO1]](s8) + ; X86-NEXT: [[ZEXT2:%[0-9]+]]:_(s8) = G_ZEXT [[TRUNC1]](s1) + ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV7]], [[UV9]], [[ZEXT2]] + ; X86-NEXT: [[TRUNC2:%[0-9]+]]:_(s1) = G_TRUNC [[USUBE1]](s8) ; X86-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) - ; X86-NEXT: [[TRUNC1:%[0-9]+]]:_(s35) = G_TRUNC [[MV2]](s64) - ; X86-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC1]](s35) - ; X86-NEXT: RET 0, implicit [[ZEXT2]](s64) + ; X86-NEXT: [[TRUNC3:%[0-9]+]]:_(s35) = G_TRUNC [[MV2]](s64) + ; X86-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC3]](s35) + ; X86-NEXT: RET 0, implicit [[ZEXT3]](s64) %0(s64) = COPY $rdx %1:_(s35) = G_TRUNC %0(s64) %2:_(s35) = G_CTLZ %1 @@ -97,6 +101,7 @@ 
body: | ; X64-NEXT: [[CTLZ:%[0-9]+]]:_(s64) = G_CTLZ [[DEF]](s64) ; X64-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[CTLZ]](s64) ; X64-NEXT: RET 0, implicit [[COPY]](s64) + ; ; X86-LABEL: name: test_ctlz64 ; X86: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64) diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir index ee2b9ee..ac3bf33 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-sub.mir @@ -157,8 +157,8 @@ body: | ; X86: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) - ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] - ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]] + ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) ; X86-NEXT: RET 0 @@ -192,8 +192,8 @@ body: | ; X86-NEXT: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64) - ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] - ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]] + ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE 
[[UV1]], [[UV3]], [[USUBO1]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) ; X86-NEXT: RET 0 @@ -219,8 +219,8 @@ body: | ; X64-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF ; X64-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](s128) ; X64-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF1]](s128) - ; X64-NEXT: [[USUBO:%[0-9]+]]:_(s64), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]] - ; X64-NEXT: [[USUBE:%[0-9]+]]:_(s64), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] + ; X64-NEXT: [[USUBO:%[0-9]+]]:_(s64), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV2]] + ; X64-NEXT: [[USUBE:%[0-9]+]]:_(s64), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]] ; X64-NEXT: $rax = COPY [[USUBO]](s64) ; X64-NEXT: $rdx = COPY [[USUBE]](s64) ; X64-NEXT: RET 0 @@ -230,10 +230,10 @@ body: | ; X86-NEXT: [[DEF1:%[0-9]+]]:_(s128) = IMPLICIT_DEF ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s128) ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s128) - ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV4]] - ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV5]], [[USUBO1]] - ; X86-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV2]], [[UV6]], [[USUBE1]] - ; X86-NEXT: [[USUBE4:%[0-9]+]]:_(s32), [[USUBE5:%[0-9]+]]:_(s1) = G_USUBE [[UV3]], [[UV7]], [[USUBE3]] + ; X86-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s8) = G_USUBO [[UV]], [[UV4]] + ; X86-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s8) = G_USUBE [[UV1]], [[UV5]], [[USUBO1]] + ; X86-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s8) = G_USUBE [[UV2]], [[UV6]], [[USUBE1]] + ; X86-NEXT: 
[[USUBE4:%[0-9]+]]:_(s32), [[USUBE5:%[0-9]+]]:_(s8) = G_USUBE [[UV3]], [[UV7]], [[USUBE3]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32) ; X86-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE2]](s32), [[USUBE4]](s32) ; X86-NEXT: $rax = COPY [[MV]](s64) diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir index 9807d13..57e729f 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros-undef.mir @@ -32,8 +32,8 @@ body: | ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -97,8 +97,8 @@ body: | ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[UV]](s32), [[C]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32) ; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C1]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C1]] + 
; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir index e2d10423..f5d8477 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-trailing-zeros.mir @@ -32,8 +32,8 @@ body: | ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[OR]](s32), [[C]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR1]](s32) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ_ZERO_UNDEF]], [[C2]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = G_UADDE [[C]], [[C]], [[UADDO1]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[OR]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -99,8 +99,8 @@ body: | ; X86-NEXT: [[ICMP:%[0-9]+]]:_(s8) = G_ICMP intpred(eq), [[UV]](s32), [[C]] ; X86-NEXT: [[CTTZ:%[0-9]+]]:_(s32) = G_CTTZ [[UV1]](s32) ; X86-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[CTTZ]], [[C1]] - ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[C]], [[C]], [[UADDO1]] + ; X86-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s8) = G_UADDO [[CTTZ]], [[C1]] + ; X86-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s8) = 
G_UADDE [[C]], [[C]], [[UADDO1]] ; X86-NEXT: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32) ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s8) ; X86-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 diff --git a/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll b/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll new file mode 100644 index 0000000..41d890b --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/pr49087.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -global-isel -global-isel-abort=1 < %s 2>&1 | FileCheck %s + +define i32 @test_01(ptr %p, i64 %len, i32 %x) { +; CHECK-LABEL: test_01: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq %rax, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: jne .LBB0_4 +; CHECK-NEXT: # %bb.2: # %backedge +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: imulq $4, %rsi, %rcx +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: cmpl %edx, (%rcx) +; CHECK-NEXT: sete %cl +; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.3: # %failure +; CHECK-NEXT: .LBB0_4: # %exit +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: retq + +entry: + %scevgep = getelementptr i32, ptr %p, i64 -1 + br label %loop + +loop: ; preds = %backedge, %entry + %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ] + %iv.next = add i64 %iv, -1 + %cond_1 = icmp eq i64 %iv, 0 + br i1 %cond_1, label %exit, label %backedge + +backedge: ; preds = %loop + %scevgep1 = getelementptr i32, ptr %scevgep, i64 %iv + %loaded = load atomic i32, ptr %scevgep1 unordered, align 4 + %cond_2 = icmp eq i32 %loaded, %x + br i1 %cond_2, label %failure, label %loop + +exit: ; preds = %loop + ret i32 -1 + +failure: + unreachable +} + diff --git 
a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir index 8eac3eaf..76680ac 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir @@ -29,8 +29,8 @@ body: | bb.0 (%ir-block.0): %0(s32) = IMPLICIT_DEF %1(s32) = IMPLICIT_DEF - %2(s1) = IMPLICIT_DEF - %3(s32), %4(s1) = G_UADDE %0, %1, %2 + %2(s8) = IMPLICIT_DEF + %3(s32), %4(s8) = G_UADDE %0, %1, %2 RET 0 ... diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir b/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir index 773813f..b85180f 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-add-x32.mir @@ -27,25 +27,24 @@ body: | bb.0 (%ir-block.0): ; X32-LABEL: name: test_add_i64 ; X32: [[DEF:%[0-9]+]]:gr32 = IMPLICIT_DEF - ; X32: [[DEF1:%[0-9]+]]:gr32 = IMPLICIT_DEF - ; X32: [[DEF2:%[0-9]+]]:gr32 = IMPLICIT_DEF - ; X32: [[DEF3:%[0-9]+]]:gr32 = IMPLICIT_DEF - ; X32: [[ADD32rr:%[0-9]+]]:gr32 = ADD32rr [[DEF]], [[DEF2]], implicit-def $eflags - ; X32: [[COPY:%[0-9]+]]:gr32 = COPY $eflags - ; X32: $eflags = COPY [[COPY]] - ; X32: [[ADC32rr:%[0-9]+]]:gr32 = ADC32rr [[DEF1]], [[DEF3]], implicit-def $eflags, implicit $eflags - ; X32: [[COPY1:%[0-9]+]]:gr32 = COPY $eflags - ; X32: $eax = COPY [[ADD32rr]] - ; X32: $edx = COPY [[ADC32rr]] - ; X32: RET 0, implicit $eax, implicit $edx + ; X32-NEXT: [[DEF1:%[0-9]+]]:gr32 = IMPLICIT_DEF + ; X32-NEXT: [[DEF2:%[0-9]+]]:gr32 = IMPLICIT_DEF + ; X32-NEXT: [[DEF3:%[0-9]+]]:gr32 = IMPLICIT_DEF + ; X32-NEXT: [[ADD32rr:%[0-9]+]]:gr32 = ADD32rr [[DEF]], [[DEF2]], implicit-def $eflags + ; X32-NEXT: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 2, implicit $eflags + ; X32-NEXT: CMP8ri [[SETCCr]], 1, implicit-def $eflags + ; X32-NEXT: [[ADC32rr:%[0-9]+]]:gr32 = ADC32rr [[DEF1]], [[DEF3]], implicit-def $eflags, implicit $eflags + ; X32-NEXT: [[SETCCr1:%[0-9]+]]:gr8 = SETCCr 2, implicit $eflags + ; X32-NEXT: 
$eax = COPY [[ADD32rr]] + ; X32-NEXT: $edx = COPY [[ADC32rr]] + ; X32-NEXT: RET 0, implicit $eax, implicit $edx %0(s32) = IMPLICIT_DEF %1(s32) = IMPLICIT_DEF %2(s32) = IMPLICIT_DEF %3(s32) = IMPLICIT_DEF %9(s8) = G_CONSTANT i8 0 - %4(s1) = G_TRUNC %9(s8) - %5(s32), %6(s1) = G_UADDE %0, %2, %4 - %7(s32), %8(s1) = G_UADDE %1, %3, %6 + %5(s32), %6(s8) = G_UADDE %0, %2, %9 + %7(s32), %8(s8) = G_UADDE %1, %3, %6 $eax = COPY %5(s32) $edx = COPY %7(s32) RET 0, implicit $eax, implicit $edx diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll b/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll new file mode 100644 index 0000000..0cf1372 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/select-get-carry-bit.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel=1 -global-isel-abort=1 | FileCheck %s + +; Issue #120029 +define i16 @use_carry_bit(i16 %2) { +; CHECK-LABEL: use_carry_bit: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $1, %ax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: addw %di, %ax +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: cmovnew %di, %ax +; CHECK-NEXT: retq + %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %2, i16 1) + %res = extractvalue { i16, i1 } %uadd, 0 + %carry = extractvalue { i16, i1 } %uadd, 1 + %ret = select i1 %carry, i16 %2, i16 %res + ret i16 %ret +} + diff --git a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll index 7a035f5..be75d7c 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll @@ -7,12 +7,15 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rdx, %rax +; X64-NEXT: setb %dl +; X64-NEXT: cmpb $1, %dl ; X64-NEXT: sbbq %rcx, %rsi ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq ; ; 
X86-LABEL: test_sub_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -21,8 +24,14 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setb %bl +; X86-NEXT: cmpb $1, %bl ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %edx, 4(%eax) @@ -30,6 +39,7 @@ define i128 @test_sub_i128(i128 %arg1, i128 %arg2) nounwind { ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %ret = sub i128 %arg1, %arg2 ret i128 %ret @@ -47,6 +57,8 @@ define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setb %cl +; X86-NEXT: cmpb $1, %cl ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl %ret = sub i64 %arg1, %arg2 diff --git a/llvm/test/CodeGen/X86/pr49087.ll b/llvm/test/CodeGen/X86/pr49087.ll deleted file mode 100644 index 1a29222..0000000 --- a/llvm/test/CodeGen/X86/pr49087.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -global-isel < %s 2>&1 | FileCheck %s -; REQUIRES: asserts -; XFAIL: * - -define i32 @test_01(ptr %p, i64 %len, i32 %x) { -; CHECK-LABEL: test_01 - -entry: - %scevgep = getelementptr i32, ptr %p, i64 -1 - br label %loop - -loop: ; preds = %backedge, %entry - %iv = phi i64 [ %iv.next, %backedge ], [ %len, %entry ] - %iv.next = add i64 %iv, -1 - %cond_1 = icmp eq i64 %iv, 0 - br i1 %cond_1, label %exit, label %backedge - -backedge: ; preds = %loop - %scevgep1 = getelementptr i32, ptr %scevgep, i64 %iv - 
%loaded = load atomic i32, ptr %scevgep1 unordered, align 4 - %cond_2 = icmp eq i32 %loaded, %x - br i1 %cond_2, label %failure, label %loop - -exit: ; preds = %loop - ret i32 -1 - -failure: - unreachable -} - diff --git a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll index 446a84d..ffdc80a 100644 --- a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll +++ b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll @@ -101,7 +101,7 @@ ; CHECK-NEXT: LocalVariableAddrRange { ; CHECK-NEXT: OffsetStart: .text+0x0 ; CHECK-NEXT: ISectStart: 0x0 -; CHECK-NEXT: Range: 0xBC +; CHECK-NEXT: Range: 0xB8 ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ProcEnd { diff --git a/llvm/test/DebugInfo/XCOFF/empty.ll b/llvm/test/DebugInfo/XCOFF/empty.ll index af2f74f..24655e5 100644 --- a/llvm/test/DebugInfo/XCOFF/empty.ll +++ b/llvm/test/DebugInfo/XCOFF/empty.ll @@ -61,7 +61,7 @@ entry: ; ASM32-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; ASM32-NEXT: .byte 0x00 # Version = 0 ; ASM32-NEXT: .byte 0x09 # Language = CPlusPlus -; ASM32-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; ASM32-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; ASM32-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; ASM32-NEXT: # -HasControlledStorage, -IsTOCless ; ASM32-NEXT: # -IsFloatingPointPresent @@ -264,7 +264,7 @@ entry: ; ASM64-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; ASM64-NEXT: .byte 0x00 # Version = 0 ; ASM64-NEXT: .byte 0x09 # Language = CPlusPlus -; ASM64-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; ASM64-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; ASM64-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; ASM64-NEXT: # -HasControlledStorage, -IsTOCless ; ASM64-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/DebugInfo/XCOFF/explicit-section.ll b/llvm/test/DebugInfo/XCOFF/explicit-section.ll index 0ae9289..3bcc0f9 100644 --- 
a/llvm/test/DebugInfo/XCOFF/explicit-section.ll +++ b/llvm/test/DebugInfo/XCOFF/explicit-section.ll @@ -65,7 +65,7 @@ entry: ; CHECK-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent @@ -113,7 +113,7 @@ entry: ; CHECK-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/DebugInfo/XCOFF/function-sections.ll b/llvm/test/DebugInfo/XCOFF/function-sections.ll index 6a86ae6..0b7a03b 100644 --- a/llvm/test/DebugInfo/XCOFF/function-sections.ll +++ b/llvm/test/DebugInfo/XCOFF/function-sections.ll @@ -60,7 +60,7 @@ entry: ; CHECK-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue +; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent @@ -95,7 +95,7 @@ entry: ; CHECK-NEXT: .vbyte 4, 0x00000000 # Traceback table begin ; CHECK-NEXT: .byte 0x00 # Version = 0 ; CHECK-NEXT: .byte 0x09 # Language = CPlusPlus -; CHECK-NEXT: .byte 0x20 # -IsGlobaLinkage, 
-IsOutOfLineEpilogOrPrologue +; CHECK-NEXT: .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue ; CHECK-NEXT: # +HasTraceBackTableOffset, -IsInternalProcedure ; CHECK-NEXT: # -HasControlledStorage, -IsTOCless ; CHECK-NEXT: # -IsFloatingPointPresent diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll index d25d0f1..4c0f9db 100644 --- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll +++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll @@ -380,9 +380,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float ; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] ; CHECK: [[BB4:.*]]: -; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP38:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE20]] ] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]] ; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) @@ -396,9 +396,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float ; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float ; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] ; CHECK: [[BB14:.*]]: -; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP48:%.*]], %[[FREM_ELSE20]] ] +; 
CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP46:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP38:%.*]], %[[FREM_ELSE]] ] ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]] ; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) @@ -408,12 +408,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: store <2 x half> [[R2]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; CHECK: [[FREM_COMPUTE]]: -; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) +; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1 ; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 ; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 11) -; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) +; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]]) ; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 ; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 @@ -423,10 +423,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 11 ; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] ; CHECK: [[FREM_ELSE]]: -; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) -; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX1]], [[AY2]] -; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], half [[TMP27]], half [[TMP1]] -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP28:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]]) +; CHECK-NEXT: 
[[TMP29:%.*]] = fcmp oeq float [[AX16]], [[AY17]] +; CHECK-NEXT: [[TMP38]] = select i1 [[TMP29]], half [[TMP28]], half [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_LOOP_BODY]]: ; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] ; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] @@ -456,15 +456,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]] ; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]]) ; CHECK-NEXT: [[TMP37:%.*]] = fptrunc float [[AX13]] to half -; CHECK-NEXT: [[TMP38]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP1]]) -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP46]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_COMPUTE19]]: -; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) +; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) ; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0 ; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1 ; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP41]], 1 ; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP40]], i32 11) -; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]]) +; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) ; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP42]], 0 ; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP42]], 1 ; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP44]], 1 @@ -474,10 +474,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 [[NB25]], 
11 ; CHECK-NEXT: br i1 [[TMP45]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]] ; CHECK: [[FREM_ELSE20]]: -; CHECK-NEXT: [[TMP46:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]]) -; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX16]], [[AY17]] -; CHECK-NEXT: [[TMP48]] = select i1 [[TMP47]], half [[TMP46]], half [[TMP11]] -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) +; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]] +; CHECK-NEXT: [[TMP57]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] ; CHECK: [[FREM_LOOP_BODY27]]: ; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ] ; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ] @@ -507,8 +507,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]] ; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]]) ; CHECK-NEXT: [[TMP56:%.*]] = fptrunc float [[AX46]] to half -; CHECK-NEXT: [[TMP57]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP11]]) -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] ; ptr addrspace(1) %in2) { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 @@ -532,9 +532,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float ; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK-NEXT: br i1 [[TMP3]], label 
%[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]] ; CHECK: [[BB4:.*]]: -; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP116:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP115:%.*]], %[[FREM_ELSE86]] ] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]] ; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]]) @@ -548,9 +548,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float ; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float ; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]] ; CHECK: [[BB14:.*]]: -; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP77:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP68:%.*]], %[[FREM_ELSE20]] ] +; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP104:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP96:%.*]], %[[FREM_ELSE53]] ] ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]] ; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]]) @@ -564,9 +564,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX49:%.*]] = fpext half [[AX47]] to float ; CHECK-NEXT: [[AY50:%.*]] = fpext half [[AY48]] to float ; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX49]], [[AY50]] -; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]] +; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]] ; CHECK: [[BB24:.*]]: -; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP96:%.*]], 
%[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP87:%.*]], %[[FREM_ELSE53]] ] +; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP85:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP77:%.*]], %[[FREM_ELSE20]] ] ; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq half [[TMP22]], 0xH0000 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], half 0xH7E00, half [[RET51]] ; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.fabs.f16(half [[TMP21]]) @@ -580,9 +580,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX82:%.*]] = fpext half [[AX80]] to float ; CHECK-NEXT: [[AY83:%.*]] = fpext half [[AY81]] to float ; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX82]], [[AY83]] -; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] ; CHECK: [[BB34:.*]]: -; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP115:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP106:%.*]], %[[FREM_ELSE86]] ] +; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP66:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP58:%.*]], %[[FREM_ELSE]] ] ; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq half [[TMP32]], 0xH0000 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], half 0xH7E00, half [[RET84]] ; CHECK-NEXT: [[TMP37:%.*]] = call half @llvm.fabs.f16(half [[TMP31]]) @@ -592,12 +592,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: store <4 x half> [[R2]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; CHECK: [[FREM_COMPUTE]]: -; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) +; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]]) ; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 ; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 ; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1 ; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 
11) -; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) +; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]]) ; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0 ; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1 ; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1 @@ -607,10 +607,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 11 ; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] ; CHECK: [[FREM_ELSE]]: -; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) -; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]] -; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]] -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP48:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]]) +; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX82]], [[AY83]] +; CHECK-NEXT: [[TMP58]] = select i1 [[TMP49]], half [[TMP48]], half [[TMP31]] +; CHECK-NEXT: br label %[[BB34]] ; CHECK: [[FREM_LOOP_BODY]]: ; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] ; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] @@ -640,15 +640,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]] ; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]]) ; CHECK-NEXT: [[TMP57:%.*]] = fptrunc float [[AX13]] to half -; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP1]]) -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP66]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP31]]) +; CHECK-NEXT: br label 
%[[BB34]] ; CHECK: [[FREM_COMPUTE19]]: -; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) +; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]]) ; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP59]], 0 ; CHECK-NEXT: [[TMP61:%.*]] = extractvalue { float, i32 } [[TMP59]], 1 ; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP61]], 1 ; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP60]], i32 11) -; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]]) +; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]]) ; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP62]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = extractvalue { float, i32 } [[TMP62]], 1 ; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP64]], 1 @@ -658,10 +658,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[NB25]], 11 ; CHECK-NEXT: br i1 [[TMP65]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]] ; CHECK: [[FREM_ELSE20]]: -; CHECK-NEXT: [[TMP66:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]]) -; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX16]], [[AY17]] -; CHECK-NEXT: [[TMP68]] = select i1 [[TMP67]], half [[TMP66]], half [[TMP11]] -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP67:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]]) +; CHECK-NEXT: [[TMP68:%.*]] = fcmp oeq float [[AX49]], [[AY50]] +; CHECK-NEXT: [[TMP77]] = select i1 [[TMP68]], half [[TMP67]], half [[TMP21]] +; CHECK-NEXT: br label %[[BB24]] ; CHECK: [[FREM_LOOP_BODY27]]: ; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ] ; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ] @@ -691,15 +691,15 @@ define amdgpu_kernel void 
@frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]] ; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]]) ; CHECK-NEXT: [[TMP76:%.*]] = fptrunc float [[AX46]] to half -; CHECK-NEXT: [[TMP77]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP11]]) -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP85]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP21]]) +; CHECK-NEXT: br label %[[BB24]] ; CHECK: [[FREM_COMPUTE52]]: -; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]]) +; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]]) ; CHECK-NEXT: [[TMP79:%.*]] = extractvalue { float, i32 } [[TMP78]], 0 ; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP78]], 1 ; CHECK-NEXT: [[EX54:%.*]] = sub i32 [[TMP80]], 1 ; CHECK-NEXT: [[AX55:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP79]], i32 11) -; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]]) +; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]]) ; CHECK-NEXT: [[TMP82:%.*]] = extractvalue { float, i32 } [[TMP81]], 0 ; CHECK-NEXT: [[TMP83:%.*]] = extractvalue { float, i32 } [[TMP81]], 1 ; CHECK-NEXT: [[EY56:%.*]] = sub i32 [[TMP83]], 1 @@ -709,10 +709,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP84:%.*]] = icmp sgt i32 [[NB58]], 11 ; CHECK-NEXT: br i1 [[TMP84]], label %[[FREM_LOOP_BODY60:.*]], label %[[FREM_LOOP_EXIT61]] ; CHECK: [[FREM_ELSE53]]: -; CHECK-NEXT: [[TMP85:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]]) -; CHECK-NEXT: [[TMP86:%.*]] = fcmp oeq float [[AX49]], [[AY50]] -; CHECK-NEXT: [[TMP87]] = select i1 [[TMP86]], half [[TMP85]], half [[TMP21]] -; CHECK-NEXT: br label %[[BB24]] +; CHECK-NEXT: [[TMP86:%.*]] = call half @llvm.copysign.f16(half 0xH0000, 
half [[TMP11]]) +; CHECK-NEXT: [[TMP87:%.*]] = fcmp oeq float [[AX16]], [[AY17]] +; CHECK-NEXT: [[TMP96]] = select i1 [[TMP87]], half [[TMP86]], half [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_LOOP_BODY60]]: ; CHECK-NEXT: [[NB_IV62:%.*]] = phi i32 [ [[NB58]], %[[FREM_COMPUTE52]] ], [ [[NB_UPDATE70:%.*]], %[[FREM_LOOP_BODY60]] ] ; CHECK-NEXT: [[AX_LOOP_PHI63:%.*]] = phi float [ [[AX55]], %[[FREM_COMPUTE52]] ], [ [[AX_UPDATE69:%.*]], %[[FREM_LOOP_BODY60]] ] @@ -742,15 +742,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX78:%.*]] = select i1 [[CLT76]], float [[AXP77]], float [[AX75]] ; CHECK-NEXT: [[AX79:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX78]], i32 [[EY56]]) ; CHECK-NEXT: [[TMP95:%.*]] = fptrunc float [[AX79]] to half -; CHECK-NEXT: [[TMP96]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP21]]) -; CHECK-NEXT: br label %[[BB24]] +; CHECK-NEXT: [[TMP104]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_COMPUTE85]]: -; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]]) +; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]]) ; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0 ; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1 ; CHECK-NEXT: [[EX87:%.*]] = sub i32 [[TMP99]], 1 ; CHECK-NEXT: [[AX88:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP98]], i32 11) -; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]]) +; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]]) ; CHECK-NEXT: [[TMP101:%.*]] = extractvalue { float, i32 } [[TMP100]], 0 ; CHECK-NEXT: [[TMP102:%.*]] = extractvalue { float, i32 } [[TMP100]], 1 ; CHECK-NEXT: [[EY89:%.*]] = sub i32 [[TMP102]], 1 @@ -760,10 +760,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; CHECK-NEXT: [[TMP103:%.*]] = icmp sgt i32 [[NB91]], 11 ; CHECK-NEXT: br i1 [[TMP103]], label %[[FREM_LOOP_BODY93:.*]], label %[[FREM_LOOP_EXIT94]] ; CHECK: [[FREM_ELSE86]]: -; CHECK-NEXT: [[TMP104:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]]) -; CHECK-NEXT: [[TMP105:%.*]] = fcmp oeq float [[AX82]], [[AY83]] -; CHECK-NEXT: [[TMP106]] = select i1 [[TMP105]], half [[TMP104]], half [[TMP31]] -; CHECK-NEXT: br label %[[BB34]] +; CHECK-NEXT: [[TMP105:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]]) +; CHECK-NEXT: [[TMP106:%.*]] = fcmp oeq float [[AX1]], [[AY2]] +; CHECK-NEXT: [[TMP115]] = select i1 [[TMP106]], half [[TMP105]], half [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] ; CHECK: [[FREM_LOOP_BODY93]]: ; CHECK-NEXT: [[NB_IV95:%.*]] = phi i32 [ [[NB91]], %[[FREM_COMPUTE85]] ], [ [[NB_UPDATE103:%.*]], %[[FREM_LOOP_BODY93]] ] ; CHECK-NEXT: [[AX_LOOP_PHI96:%.*]] = phi float [ [[AX88]], %[[FREM_COMPUTE85]] ], [ [[AX_UPDATE102:%.*]], %[[FREM_LOOP_BODY93]] ] @@ -793,8 +793,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX111:%.*]] = select i1 [[CLT109]], float [[AXP110]], float [[AX108]] ; CHECK-NEXT: [[AX112:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX111]], i32 [[EY89]]) ; CHECK-NEXT: [[TMP114:%.*]] = fptrunc float [[AX112]] to half -; CHECK-NEXT: [[TMP115]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP31]]) -; CHECK-NEXT: br label %[[BB34]] +; CHECK-NEXT: [[TMP116]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] ; ptr addrspace(1) %in2) { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 @@ -816,9 +816,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) ; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]] -; 
CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] ; CHECK: [[BB4:.*]]: -; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP56:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]] ; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) @@ -830,9 +830,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) ; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] ; CHECK: [[BB14:.*]]: -; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ] ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]] ; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) @@ -842,12 +842,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: store <2 x float> [[R2]], ptr addrspace(1) [[OUT]], align 8 ; CHECK-NEXT: ret void ; CHECK: [[FREM_COMPUTE]]: -; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) +; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float [[AX12]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1 ; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 ; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 12) -; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) +; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) ; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 ; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 @@ -857,10 +857,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 12 ; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] ; CHECK: [[FREM_ELSE]]: -; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) -; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX]], [[AY]] -; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], float [[TMP27]], float [[TMP1]] -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) +; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq float [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], float [[TMP28]], float [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_LOOP_BODY]]: ; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] ; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] @@ -889,15 +889,15 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]] ; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]] ; 
CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]]) -; CHECK-NEXT: [[TMP37]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]]) -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP45]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_COMPUTE15]]: -; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]]) +; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) ; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { float, i32 } [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP38]], 1 ; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1 ; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 12) -; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) +; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) ; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0 ; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP41]], 1 ; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1 @@ -907,10 +907,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 12 ; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] ; CHECK: [[FREM_ELSE16]]: -; CHECK-NEXT: [[TMP45:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) -; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq float [[AX12]], [[AY13]] -; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], float [[TMP45]], float [[TMP11]] -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP46:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) +; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX]], [[AY]] +; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], float [[TMP46]], float [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] 
; CHECK: [[FREM_LOOP_BODY23]]: ; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] ; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ] @@ -939,8 +939,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]] ; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]] ; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]]) -; CHECK-NEXT: [[TMP55]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]]) -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP56]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] ; ptr addrspace(1) %in2) { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 @@ -962,9 +962,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) ; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]] ; CHECK: [[BB4:.*]]: -; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP112:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP111:%.*]], %[[FREM_ELSE78]] ] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]] ; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]]) @@ -976,9 +976,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) ; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]] ; CHECK: [[BB14:.*]]: -; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP75:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP67:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP101:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP93:%.*]], %[[FREM_ELSE47]] ] ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]] ; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]]) @@ -990,9 +990,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX43:%.*]] = call float @llvm.fabs.f32(float [[TMP21]]) ; CHECK-NEXT: [[AY44:%.*]] = call float @llvm.fabs.f32(float [[TMP22]]) ; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX43]], [[AY44]] -; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]] +; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] ; CHECK: [[BB24:.*]]: -; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP93:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP85:%.*]], %[[FREM_ELSE47]] ] +; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP83:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP75:%.*]], %[[FREM_ELSE16]] ] ; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq float [[TMP22]], 0.000000e+00 ; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float 0x7FF8000000000000, float [[RET45]] ; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.fabs.f32(float [[TMP21]]) @@ -1004,9 +1004,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
CHECK-NEXT: [[AX74:%.*]] = call float @llvm.fabs.f32(float [[TMP31]]) ; CHECK-NEXT: [[AY75:%.*]] = call float @llvm.fabs.f32(float [[TMP32]]) ; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX74]], [[AY75]] -; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] ; CHECK: [[BB34:.*]]: -; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP111:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP103:%.*]], %[[FREM_ELSE78]] ] +; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP65:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE]] ] ; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq float [[TMP32]], 0.000000e+00 ; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float 0x7FF8000000000000, float [[RET76]] ; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.fabs.f32(float [[TMP31]]) @@ -1016,12 +1016,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: store <4 x float> [[R2]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; CHECK: [[FREM_COMPUTE]]: -; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) +; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]]) ; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 ; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 ; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1 ; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 12) -; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) +; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]]) ; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0 ; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1 ; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1 @@ -1031,10 +1031,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, 
ptr addrspace(1) %i ; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 12 ; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] ; CHECK: [[FREM_ELSE]]: -; CHECK-NEXT: [[TMP47:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) -; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX]], [[AY]] -; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], float [[TMP47]], float [[TMP1]] -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP48:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]]) +; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX74]], [[AY75]] +; CHECK-NEXT: [[TMP57]] = select i1 [[TMP49]], float [[TMP48]], float [[TMP31]] +; CHECK-NEXT: br label %[[BB34]] ; CHECK: [[FREM_LOOP_BODY]]: ; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] ; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] @@ -1063,15 +1063,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]] ; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]] ; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]]) -; CHECK-NEXT: [[TMP57]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]]) -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP65]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP31]]) +; CHECK-NEXT: br label %[[BB34]] ; CHECK: [[FREM_COMPUTE15]]: -; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]]) +; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]]) ; CHECK-NEXT: [[TMP59:%.*]] = extractvalue { float, i32 } [[TMP58]], 0 ; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP58]], 1 ; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP60]], 1 ; CHECK-NEXT: [[AX18:%.*]] = call 
float @llvm.ldexp.f32.i32(float [[TMP59]], i32 12) -; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) +; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]]) ; CHECK-NEXT: [[TMP62:%.*]] = extractvalue { float, i32 } [[TMP61]], 0 ; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP61]], 1 ; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP63]], 1 @@ -1081,10 +1081,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[NB21]], 12 ; CHECK-NEXT: br i1 [[TMP64]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] ; CHECK: [[FREM_ELSE16]]: -; CHECK-NEXT: [[TMP65:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) -; CHECK-NEXT: [[TMP66:%.*]] = fcmp oeq float [[AX12]], [[AY13]] -; CHECK-NEXT: [[TMP67]] = select i1 [[TMP66]], float [[TMP65]], float [[TMP11]] -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP66:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]]) +; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX43]], [[AY44]] +; CHECK-NEXT: [[TMP75]] = select i1 [[TMP67]], float [[TMP66]], float [[TMP21]] +; CHECK-NEXT: br label %[[BB24]] ; CHECK: [[FREM_LOOP_BODY23]]: ; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] ; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ] @@ -1113,15 +1113,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]] ; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]] ; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]]) -; CHECK-NEXT: [[TMP75]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]]) -; CHECK-NEXT: br label %[[BB14]] +; 
CHECK-NEXT: [[TMP83]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP21]]) +; CHECK-NEXT: br label %[[BB24]] ; CHECK: [[FREM_COMPUTE46]]: -; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]]) +; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]]) ; CHECK-NEXT: [[TMP77:%.*]] = extractvalue { float, i32 } [[TMP76]], 0 ; CHECK-NEXT: [[TMP78:%.*]] = extractvalue { float, i32 } [[TMP76]], 1 ; CHECK-NEXT: [[EX48:%.*]] = sub i32 [[TMP78]], 1 ; CHECK-NEXT: [[AX49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP77]], i32 12) -; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]]) +; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]]) ; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP79]], 0 ; CHECK-NEXT: [[TMP81:%.*]] = extractvalue { float, i32 } [[TMP79]], 1 ; CHECK-NEXT: [[EY50:%.*]] = sub i32 [[TMP81]], 1 @@ -1131,10 +1131,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[NB52]], 12 ; CHECK-NEXT: br i1 [[TMP82]], label %[[FREM_LOOP_BODY54:.*]], label %[[FREM_LOOP_EXIT55]] ; CHECK: [[FREM_ELSE47]]: -; CHECK-NEXT: [[TMP83:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]]) -; CHECK-NEXT: [[TMP84:%.*]] = fcmp oeq float [[AX43]], [[AY44]] -; CHECK-NEXT: [[TMP85]] = select i1 [[TMP84]], float [[TMP83]], float [[TMP21]] -; CHECK-NEXT: br label %[[BB24]] +; CHECK-NEXT: [[TMP84:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]]) +; CHECK-NEXT: [[TMP85:%.*]] = fcmp oeq float [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP93]] = select i1 [[TMP85]], float [[TMP84]], float [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_LOOP_BODY54]]: ; CHECK-NEXT: [[NB_IV56:%.*]] = phi i32 [ [[NB52]], %[[FREM_COMPUTE46]] ], [ [[NB_UPDATE64:%.*]], %[[FREM_LOOP_BODY54]] ] ; CHECK-NEXT: [[AX_LOOP_PHI57:%.*]] = phi 
float [ [[AX49]], %[[FREM_COMPUTE46]] ], [ [[AX_UPDATE63:%.*]], %[[FREM_LOOP_BODY54]] ] @@ -1163,15 +1163,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP71:%.*]] = fadd float [[AX69]], [[AY51]] ; CHECK-NEXT: [[AX72:%.*]] = select i1 [[CLT70]], float [[AXP71]], float [[AX69]] ; CHECK-NEXT: [[AX73:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX72]], i32 [[EY50]]) -; CHECK-NEXT: [[TMP93]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP21]]) -; CHECK-NEXT: br label %[[BB24]] +; CHECK-NEXT: [[TMP101]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP11]]) +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_COMPUTE77]]: -; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]]) +; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]]) ; CHECK-NEXT: [[TMP95:%.*]] = extractvalue { float, i32 } [[TMP94]], 0 ; CHECK-NEXT: [[TMP96:%.*]] = extractvalue { float, i32 } [[TMP94]], 1 ; CHECK-NEXT: [[EX79:%.*]] = sub i32 [[TMP96]], 1 ; CHECK-NEXT: [[AX80:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP95]], i32 12) -; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]]) +; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]]) ; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0 ; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1 ; CHECK-NEXT: [[EY81:%.*]] = sub i32 [[TMP99]], 1 @@ -1181,10 +1181,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt i32 [[NB83]], 12 ; CHECK-NEXT: br i1 [[TMP100]], label %[[FREM_LOOP_BODY85:.*]], label %[[FREM_LOOP_EXIT86]] ; CHECK: [[FREM_ELSE78]]: -; CHECK-NEXT: [[TMP101:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]]) -; CHECK-NEXT: [[TMP102:%.*]] = fcmp oeq float [[AX74]], [[AY75]] -; CHECK-NEXT: [[TMP103]] = select i1 
[[TMP102]], float [[TMP101]], float [[TMP31]] -; CHECK-NEXT: br label %[[BB34]] +; CHECK-NEXT: [[TMP102:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]]) +; CHECK-NEXT: [[TMP103:%.*]] = fcmp oeq float [[AX]], [[AY]] +; CHECK-NEXT: [[TMP111]] = select i1 [[TMP103]], float [[TMP102]], float [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] ; CHECK: [[FREM_LOOP_BODY85]]: ; CHECK-NEXT: [[NB_IV87:%.*]] = phi i32 [ [[NB83]], %[[FREM_COMPUTE77]] ], [ [[NB_UPDATE95:%.*]], %[[FREM_LOOP_BODY85]] ] ; CHECK-NEXT: [[AX_LOOP_PHI88:%.*]] = phi float [ [[AX80]], %[[FREM_COMPUTE77]] ], [ [[AX_UPDATE94:%.*]], %[[FREM_LOOP_BODY85]] ] @@ -1213,8 +1213,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP102:%.*]] = fadd float [[AX100]], [[AY82]] ; CHECK-NEXT: [[AX103:%.*]] = select i1 [[CLT101]], float [[AXP102]], float [[AX100]] ; CHECK-NEXT: [[AX104:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX103]], i32 [[EY81]]) -; CHECK-NEXT: [[TMP111]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP31]]) -; CHECK-NEXT: br label %[[BB34]] +; CHECK-NEXT: [[TMP112]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] ; ptr addrspace(1) %in2) { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 @@ -1236,9 +1236,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX:%.*]] = call double @llvm.fabs.f64(double [[TMP1]]) ; CHECK-NEXT: [[AY:%.*]] = call double @llvm.fabs.f64(double [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt double [[AX]], [[AY]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] ; CHECK: [[BB4:.*]]: -; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ] +; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP56:%.*]], 
%[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ] ; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq double [[TMP2]], 0.000000e+00 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double 0x7FF8000000000000, double [[RET]] ; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.fabs.f64(double [[TMP1]]) @@ -1250,9 +1250,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AX12:%.*]] = call double @llvm.fabs.f64(double [[TMP11]]) ; CHECK-NEXT: [[AY13:%.*]] = call double @llvm.fabs.f64(double [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt double [[AX12]], [[AY13]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]] ; CHECK: [[BB14:.*]]: -; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ] +; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ] ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq double [[TMP12]], 0.000000e+00 ; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], double 0x7FF8000000000000, double [[RET14]] ; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fabs.f64(double [[TMP11]]) @@ -1262,12 +1262,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: store <2 x double> [[R2]], ptr addrspace(1) [[OUT]], align 16 ; CHECK-NEXT: ret void ; CHECK: [[FREM_COMPUTE]]: -; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]]) +; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { double, i32 } [[TMP20]], 0 ; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, i32 } [[TMP20]], 1 ; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1 ; CHECK-NEXT: [[AX1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP21]], i32 26) -; CHECK-NEXT: 
[[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]]) +; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]]) ; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { double, i32 } [[TMP23]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { double, i32 } [[TMP23]], 1 ; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1 @@ -1277,10 +1277,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 26 ; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]] ; CHECK: [[FREM_ELSE]]: -; CHECK-NEXT: [[TMP27:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]]) -; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq double [[AX]], [[AY]] -; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], double [[TMP27]], double [[TMP1]] -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]]) +; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq double [[AX12]], [[AY13]] +; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], double [[TMP28]], double [[TMP11]] +; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_LOOP_BODY]]: ; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] ; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ] @@ -1309,15 +1309,15 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP9:%.*]] = fadd double [[AX7]], [[AY2]] ; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], double [[AXP9]], double [[AX7]] ; CHECK-NEXT: [[AX11:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX10]], i32 [[EY]]) -; CHECK-NEXT: [[TMP37]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP1]]) -; CHECK-NEXT: br label %[[BB4]] +; CHECK-NEXT: [[TMP45]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP11]]) 
+; CHECK-NEXT: br label %[[BB14]] ; CHECK: [[FREM_COMPUTE15]]: -; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]]) +; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]]) ; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { double, i32 } [[TMP38]], 0 ; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { double, i32 } [[TMP38]], 1 ; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1 ; CHECK-NEXT: [[AX18:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP39]], i32 26) -; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]]) +; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]]) ; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { double, i32 } [[TMP41]], 0 ; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { double, i32 } [[TMP41]], 1 ; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1 @@ -1327,10 +1327,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 26 ; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]] ; CHECK: [[FREM_ELSE16]]: -; CHECK-NEXT: [[TMP45:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]]) -; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq double [[AX12]], [[AY13]] -; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], double [[TMP45]], double [[TMP11]] -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP46:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]]) +; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq double [[AX]], [[AY]] +; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], double [[TMP46]], double [[TMP1]] +; CHECK-NEXT: br label %[[BB4]] ; CHECK: [[FREM_LOOP_BODY23]]: ; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ] ; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi double [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], 
%[[FREM_LOOP_BODY23]] ] @@ -1359,8 +1359,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CHECK-NEXT: [[AXP40:%.*]] = fadd double [[AX38]], [[AY20]] ; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], double [[AXP40]], double [[AX38]] ; CHECK-NEXT: [[AX42:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX41]], i32 [[EY19]]) -; CHECK-NEXT: [[TMP55]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP11]]) -; CHECK-NEXT: br label %[[BB14]] +; CHECK-NEXT: [[TMP56]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP1]]) +; CHECK-NEXT: br label %[[BB4]] ; ptr addrspace(1) %in2) { %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll index fae1365..e1d39fd 100644 --- a/llvm/test/Transforms/InstCombine/add-sitofp.ll +++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll @@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) { ret float %p } +; Don't perform the fold on vector operations, as the integer op may be +; much more expensive than the float op in that case. 
define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @test_4( ; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823) ; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823) -; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]] -; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double> +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double> +; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double> +; CHECK-NEXT: [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]] ; CHECK-NEXT: ret <4 x double> [[RES]] ; ; Drop two highest bits to guarantee that %a + %b doesn't overflow diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll index 702bbbb..57184ea 100644 --- a/llvm/test/Transforms/InstCombine/binop-itofp.ll +++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll @@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345) ret float %mul3.i.i } +; Don't perform the fold on vector operations, as the integer op may be +; much more expensive than the float op in that case. 
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) { +; CHECK-LABEL: @test_ui_ui_i8_mul_vec( +; CHECK-NEXT: [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15) +; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15) +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half> +; CHECK-NEXT: [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half> +; CHECK-NEXT: [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]] +; CHECK-NEXT: ret <2 x half> [[R]] +; + %x = and <2 x i8> %x_in, splat (i8 15) + %y = and <2 x i8> %y_in, splat (i8 15) + %xf = uitofp <2 x i8> %x to <2 x half> + %yf = uitofp <2 x i8> %y to <2 x half> + %r = fmul <2 x half> %xf, %yf + ret <2 x half> %r +} + define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) { ; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison( ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264 @@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c, ; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0 ; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float> +; CHECK-NEXT: [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00> ; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4 -; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]] +; CHECK-NEXT: ret <2 x float> [[MUL3_I_I1]] ; %sel = select i1 %c, i32 65529, i32 53264 %conv.i.s = trunc i32 %sel to i16 diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index aed1e29..4db3d1e 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1374,3 +1374,49 @@ exit.1: exit.2: ret i16 1 } + +; Loop with a switch 
terminator in the latch block. Cannot be vectorized +; currently. +; Test case for https://github.com/llvm/llvm-project/issues/156894. +define void @switch_in_latch(ptr %a) { +; CHECK-LABEL: @switch_in_latch( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: switch i32 [[IV_NEXT]], label [[LOOP]] [ +; CHECK-NEXT: i32 100, label [[EXIT:%.*]] +; CHECK-NEXT: ] +; CHECK: exit: +; CHECK-NEXT: ret void +; +; TAILFOLD-LABEL: @switch_in_latch( +; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: br label [[LOOP:%.*]] +; TAILFOLD: loop: +; TAILFOLD-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]] +; TAILFOLD-NEXT: store i32 1, ptr [[GEP]], align 4 +; TAILFOLD-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; TAILFOLD-NEXT: switch i32 [[IV_NEXT]], label [[LOOP]] [ +; TAILFOLD-NEXT: i32 100, label [[EXIT:%.*]] +; TAILFOLD-NEXT: ] +; TAILFOLD: exit: +; TAILFOLD-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %a, i32 %iv + store i32 1, ptr %gep, align 4 + %iv.next = add i32 %iv, 1 + switch i32 %iv.next, label %loop [i32 100, label %exit] + +exit: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/non-commutative-second-arg-only-copyable.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/non-commutative-second-arg-only-copyable.ll new file mode 100644 index 0000000..0561466 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/non-commutative-second-arg-only-copyable.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt 
-passes=slp-vectorizer -S -slp-threshold=-9999 -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define i32 @main(ptr %q, ptr %a, i8 %.pre) { +; CHECK-LABEL: define i32 @main( +; CHECK-SAME: ptr [[Q:%.*]], ptr [[A:%.*]], i8 [[DOTPRE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOTPRE1:%.*]] = load i8, ptr [[Q]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[DOTPRE]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[DOTPRE1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], <i32 0, i32 1> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> <i32 poison, i32 1>, <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shl <2 x i32> [[TMP4]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; CHECK-NEXT: store <2 x i16> [[TMP6]], ptr [[A]], align 2 +; CHECK-NEXT: ret i32 0 +; +entry: + %.pre1 = load i8, ptr %q, align 1 + %conv11.i = sext i8 %.pre to i32 + %shl18.i = shl i32 %conv11.i, %conv11.i + %conv19.i = trunc i32 %shl18.i to i16 + store i16 %conv19.i, ptr %a, align 2 + %0 = sext i8 %.pre1 to i32 + %1 = add i32 %0, 1 + %shl18.i.1 = shl i32 1, %1 + %conv19.i.1 = trunc i32 %shl18.i.1 to i16 + %arrayidx21.i.1 = getelementptr i8, ptr %a, i64 2 + store i16 %conv19.i.1, ptr %arrayidx21.i.1, align 2 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-phi-node-reordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-phi-node-reordered.ll new file mode 100644 index 0000000..d01c35f --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-phi-node-reordered.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(i32 %arg, i32 
%arg1) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[ARG:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[ARG]], i32 0 +; CHECK-NEXT: br label %[[BB6:.*]] +; CHECK: [[BB2:.*]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ [[TMP14:%.*]], %[[BB19:.*]] ] +; CHECK-NEXT: ret void +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP17:%.*]], %[[BB26:.*]] ], [ [[TMP16:%.*]], %[[BB27:.*]] ], [ zeroinitializer, %[[BB25:.*]] ] +; CHECK-NEXT: switch i8 0, label %[[BB11:.*]] [ +; CHECK-NEXT: i8 0, label %[[BB28:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB11]]: +; CHECK-NEXT: [[PHI12:%.*]] = phi i32 [ 0, %[[BB28]] ], [ 0, %[[BB6]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP3]], %[[BB28]] ], [ zeroinitializer, %[[BB6]] ] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, <4 x i32> <i32 0, i32 5, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[ARG]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[PHI12]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 0, i32 poison, i32 0>, <4 x i32> <i32 poison, i32 5, i32 2, i32 7> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP1]], <4 x i32> <i32 4, i32 1, i32 2, i32 3> +; CHECK-NEXT: switch i8 0, label %[[BB19]] [ +; CHECK-NEXT: i8 1, label %[[BB17:.*]] +; CHECK-NEXT: i8 0, label %[[BB18:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB17]]: +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 3, i32 6, i32 poison> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> <i32 poison, i32 
poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7> +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 0> +; CHECK-NEXT: br label %[[BB19]] +; CHECK: [[BB19]]: +; CHECK-NEXT: [[TMP14]] = phi <4 x i32> [ [[TMP10]], %[[BB17]] ], [ [[TMP7]], %[[BB18]] ], [ [[TMP9]], %[[BB11]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP12]], %[[BB17]] ], [ [[TMP13]], %[[BB18]] ], [ [[TMP7]], %[[BB11]] ] +; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1> +; CHECK-NEXT: br i1 false, label %[[BB2]], label %[[BB25]] +; CHECK: [[BB25]]: +; CHECK-NEXT: switch i8 0, label %[[BB6]] [ +; CHECK-NEXT: i8 0, label %[[BB26]] +; CHECK-NEXT: i8 1, label %[[BB27]] +; CHECK-NEXT: i8 6, label %[[BB27]] +; CHECK-NEXT: ] +; CHECK: [[BB26]]: +; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP0]], <4 x i32> <i32 4, i32 1, i32 2, i32 3> +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB27]]: +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB28]]: +; CHECK-NEXT: br label %[[BB11]] +; +bb: + br label %bb6 + +bb2: + %phi = phi i32 [ %phi21, %bb19 ] + %phi3 = phi i32 [ %phi22, %bb19 ] + %phi4 = phi i32 [ %phi23, %bb19 ] + %phi5 = phi i32 [ %phi24, %bb19 ] + ret void + +bb6: + %phi7 = phi i32 [ 0, %bb ], [ %phi24, %bb26 ], [ %phi24, %bb27 ], [ 0, %bb25 ] + %phi8 = phi i32 [ 0, %bb ], [ %arg1, %bb26 ], [ %phi23, %bb27 ], [ 0, %bb25 ] + %phi9 = phi i32 [ 0, %bb ], [ %phi22, %bb26 ], [ %phi20, %bb27 ], [ 0, %bb25 ] + %phi10 = phi i32 [ 0, %bb ], [ %phi21, %bb26 ], [ %phi21, %bb27 ], [ 0, %bb25 ] + switch i8 0, label %bb11 [ + i8 0, label %bb28 + ] + +bb11: + %phi12 = phi i32 [ 0, %bb28 ], [ 0, %bb6 ] + %phi13 = phi i32 [ %phi10, %bb28 ], [ 0, %bb6 ] + %phi14 = phi i32 [ %phi9, %bb28 ], [ 0, %bb6 ] + %phi15 = phi i32 [ %phi8, %bb28 ], [ 0, %bb6 ] + %phi16 = phi i32 [ %phi7, %bb28 ], [ 
0, %bb6 ] + switch i8 0, label %bb19 [ + i8 1, label %bb17 + i8 0, label %bb18 + ] + +bb17: + %add = add i32 %phi16, 0 + br label %bb19 + +bb18: + br label %bb19 + +bb19: + %phi20 = phi i32 [ 0, %bb17 ], [ %arg, %bb18 ], [ %phi12, %bb11 ] + %phi21 = phi i32 [ %phi13, %bb17 ], [ %phi12, %bb18 ], [ 0, %bb11 ] + %phi22 = phi i32 [ %phi14, %bb17 ], [ 0, %bb18 ], [ 0, %bb11 ] + %phi23 = phi i32 [ %phi15, %bb17 ], [ %arg, %bb18 ], [ %arg, %bb11 ] + %phi24 = phi i32 [ %add, %bb17 ], [ %phi16, %bb18 ], [ %phi16, %bb11 ] + br i1 false, label %bb2, label %bb25 + +bb25: + switch i8 0, label %bb6 [ + i8 0, label %bb26 + i8 1, label %bb27 + i8 6, label %bb27 + ] + +bb26: + br label %bb6 + +bb27: + br label %bb6 + +bb28: + br label %bb11 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll index d626230..5253f9f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-nodes-incoming-same-blocks.ll @@ -6,7 +6,7 @@ define void @test(ptr %0, i1 %1, i1 %2) { ; CHECK-SAME: ptr [[TMP0:%.*]], i1 [[TMP1:%.*]], i1 [[TMP2:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: br label %[[BB4:.*]] ; CHECK: [[BB4]]: -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], %[[TMP7:.*]] ], [ zeroinitializer, [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], %[[TMP7:.*]] ], [ zeroinitializer, [[TMP3:%.*]] ] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1> ; CHECK-NEXT: br i1 [[TMP1]], label %[[TMP7]], label %[[BB15:.*]] ; CHECK: [[TMP7]]: @@ -14,9 +14,9 @@ define void @test(ptr %0, i1 %1, i1 %2) { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i32>, ptr [[TMP9]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i32> [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP12]] 
= shufflevector <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 poison>, <2 x i32> <i32 2, i32 1> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <4 x i32> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> +; CHECK-NEXT: [[TMP15]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> <i32 1, i32 poison>, <2 x i32> <i32 2, i32 1> ; CHECK-NEXT: br i1 [[TMP2]], label %[[BB16:.*]], label %[[BB4]] ; CHECK: [[BB15]]: ; CHECK-NEXT: br label %[[BB16]] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected index 429bee4..a8c2531 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -65,8 +65,8 @@ define dso_local i32 @main() #0 { attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-LABEL: check_boundaries: -; CHECK: check_boundaries$local: -; CHECK-NEXT: .type check_boundaries$local,@function +; CHECK: .Lcheck_boundaries$local: +; CHECK-NEXT: .type .Lcheck_boundaries$local,@function ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -107,8 +107,8 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: s_setpc_b64 s[30:31] ; ; CHECK-LABEL: main: -; CHECK: main$local: -; CHECK-NEXT: .type main$local,@function +; CHECK: .Lmain$local: +; CHECK-NEXT: .type .Lmain$local,@function ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git 
a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected index 842fd88..34530f2 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -6,8 +6,8 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-LABEL: check_boundaries: -; CHECK: check_boundaries$local: -; CHECK-NEXT: .type check_boundaries$local,@function +; CHECK: .Lcheck_boundaries$local: +; CHECK-NEXT: .type .Lcheck_boundaries$local,@function ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -84,8 +84,8 @@ define dso_local i32 @check_boundaries() #0 { define dso_local i32 @main() #0 { ; CHECK-LABEL: main: -; CHECK: main$local: -; CHECK-NEXT: .type main$local,@function +; CHECK: .Lmain$local: +; CHECK-NEXT: .type .Lmain$local,@function ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 731d648..b7f898f 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -623,8 +623,9 @@ public: }); } - char *prepare(ExecutorAddr Addr, size_t ContentSize) override { - return InProcessMemoryMapper::prepare(Addr - DeltaAddr, ContentSize); + char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr, + size_t ContentSize) override { + return InProcessMemoryMapper::prepare(G, Addr - DeltaAddr, ContentSize); } void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override { diff --git a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp 
b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp index c5e9d43..a5269f7 100644 --- a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp @@ -39,8 +39,8 @@ public: return Mapper->initialize(AI, std::move(OnInitialized)); } - char *prepare(ExecutorAddr Addr, size_t ContentSize) override { - return Mapper->prepare(Addr, ContentSize); + char *prepare(LinkGraph &G, ExecutorAddr Addr, size_t ContentSize) override { + return Mapper->prepare(G, Addr, ContentSize); } void deinitialize(ArrayRef<ExecutorAddr> Allocations, diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp index fea9eab..1174493 100644 --- a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/MemoryMapper.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/Support/Process.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" @@ -66,6 +67,9 @@ TEST(MemoryMapperTest, InitializeDeinitialize) { { std::unique_ptr<MemoryMapper> Mapper = cantFail(InProcessMemoryMapper::Create()); + jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + jitlink::getGenericEdgeKindName); // We will do two separate allocations auto PageSize = Mapper->getPageSize(); @@ -80,7 +84,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) { { // Provide working memory - char *WA1 = Mapper->prepare(Mem1->Start, HW.size() + 1); + char *WA1 = Mapper->prepare(G, Mem1->Start, HW.size() + 1); std::strcpy(WA1, HW.c_str()); } @@ -105,7 +109,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) { } { - char *WA2 = Mapper->prepare(Mem1->Start + PageSize, HW.size() + 1); + char 
*WA2 = Mapper->prepare(G, Mem1->Start + PageSize, HW.size() + 1); std::strcpy(WA2, HW.c_str()); } @@ -158,7 +162,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) { auto Mem2 = reserve(*Mapper, PageSize); EXPECT_THAT_ERROR(Mem2.takeError(), Succeeded()); - char *WA = Mapper->prepare(Mem2->Start, HW.size() + 1); + char *WA = Mapper->prepare(G, Mem2->Start, HW.size() + 1); std::strcpy(WA, HW.c_str()); MemoryMapper::AllocInfo Alloc3; diff --git a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp index 700500f..7775f3c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp @@ -8,6 +8,7 @@ #include "OrcTestCommon.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX +#include "llvm/ExecutionEngine/JITLink/JITLink.h" #include "llvm/ExecutionEngine/Orc/MemoryMapper.h" #include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h" @@ -67,12 +68,16 @@ TEST(SharedMemoryMapperTest, MemReserveInitializeDeinitializeRelease) { auto PageSize = Mapper->getPageSize(); size_t ReqSize = PageSize; + jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + jitlink::getGenericEdgeKindName); Mapper->reserve(ReqSize, [&](Expected<ExecutorAddrRange> Result) { EXPECT_THAT_ERROR(Result.takeError(), Succeeded()); auto Reservation = std::move(*Result); { - char *Addr = Mapper->prepare(Reservation.Start, TestString.size() + 1); + char *Addr = + Mapper->prepare(G, Reservation.Start, TestString.size() + 1); std::strcpy(Addr, TestString.c_str()); } MemoryMapper::AllocInfo AI; diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp index fd8ddb1..3938d39 100644 --- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp +++ 
b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp @@ -592,7 +592,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls( DenseMap<PredicateWithCC, LibcallsWithCC> Pred2Funcs; SmallVector<uint64_t, 32> BitsetValues( - divideCeil(RuntimeLibcallImplDefList.size(), BitsPerStorageElt)); + divideCeil(RuntimeLibcallImplDefList.size() + 1, BitsPerStorageElt)); for (const Record *Elt : *Elements) { const RuntimeLibcallImpl *LibCallImpl = getRuntimeLibcallImpl(Elt); diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 6f809c5..e9fd132 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -205,9 +205,10 @@ class ClangFormatHelper(FormatHelper): @property def instructions(self) -> str: - # TODO(boomanaiden154): Add --diff_from_common_commit option when it has - # landed as in available in a released version. - return " ".join(self._construct_command(["origin/main", "HEAD"])) + return ( + " ".join(self._construct_command(["origin/main", "HEAD"])) + + " --diff_from_common_commit" + ) def should_include_extensionless_file(self, path: str) -> bool: return path.startswith("libcxx/include") diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn index 1afd342..c9f3a074 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn @@ -31,6 +31,7 @@ unittest("ClangAnalysisFlowSensitiveTests") { "LoggerTest.cpp", "MapLatticeTest.cpp", "MatchSwitchTest.cpp", + "MockHeaders.cpp", "MultiVarConstantPropagationTest.cpp", "RecordOpsTest.cpp", "SignAnalysisTest.cpp", diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index e048caa..cbff478d 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -220,7 +220,7 @@ syn keyword llvmError 
getresult begin end syn match llvmNoName /[%@!]\d\+\>/ syn match llvmNumber /-\?\<\d\+\>/ syn match llvmFloat /-\?\<\d\+\.\d*\(e[+-]\d\+\)\?\>/ -syn match llvmFloat /\<0x[KLMHR]\?\x\+\>/ +syn match llvmFloat /\<\(u\|s\)\?0x[KLMHR]\?\x\+\>/ syn keyword llvmBoolean true false syn keyword llvmConstant zeroinitializer undef null none poison vscale syn match llvmComment /;.*$/ diff --git a/mlir/docs/Bindings/Python.md b/mlir/docs/Bindings/Python.md index 893c6d4..7e6a466a 100644 --- a/mlir/docs/Bindings/Python.md +++ b/mlir/docs/Bindings/Python.md @@ -1188,6 +1188,21 @@ which can be `import`ed from the main dialect file, i.e. `python/mlir/dialects/<dialect-namespace>/passes.py` if it is undesirable to make the passes available along with the dialect. +## Extending MLIR in Python + +The MLIR Python bindings provide support for defining custom components in Python, +mainly including dialects, passes, and rewrite patterns. +The following sections outline how each of these can be implemented. + +### Dialects + +Dialects can be defined through the IRDL dialect bindings in Python. +The IRDL bindings offer a `load_dialects` function that +converts an MLIR module containing `irdl.dialect` ops into MLIR dialects. +For further details, see the documentation of [the IRDL dialect](../Dialects/IRDL.md). + +### Passes + Passes can be defined as Python callables via the `PassManager.add` API. In such case, the callable is wrapped as an `mlir::Pass` internally and executed as part of the pass pipeline when `PassManager.run` is invoked. @@ -1209,6 +1224,44 @@ pm.add('some-cpp-defined-passes') pm.run(some_op) ``` +### Rewrite Patterns + +Rewrite patterns can be registered via the `add` method +of `mlir.rewrite.RewritePatternSet` in Python. +This method takes the operation type to be rewritten +and a Python callable that defines the *match and rewrite* logic. 
+Note that the Python callable should be defined so that +the rewrite is applied if and only if the match succeeds, +which corresponds to the return value being castable to `False`. + +The `RewritePatternSet` can be converted into +a `FrozenRewritePatternSet` using the `freeze` method, +which can be applied to an operation through +the greedy pattern driver using `apply_patterns_and_fold_greedily`. +The following example demonstrates the typical usage: + +```python +def to_muli(op, rewriter): + with rewriter.ip: + new_op = arith.muli(op.lhs, op.rhs, loc=op.location) + rewriter.replace_op(op, new_op) + +patterns = RewritePatternSet() +patterns.add(arith.AddIOp, to_muli) # Rewrite arith.addi into arith.muli +patterns.add(...) +frozen = patterns.freeze() + +module = ... +apply_patterns_and_fold_greedily(module, frozen) +``` + +The PDL dialect bindings also enable defining and generating rewrite patterns in Python. +The `mlir.rewrite.PDLModule` class accepts a module containing `pdl.pattern` ops, +which can be transformed into a `FrozenRewritePatternSet` using the `freeze` method. +This frozen set can then be applied to an operation +using the greedy rewrite pattern driver via `apply_patterns_and_fold_greedily`. +For further information, see [the PDL dialect documentation](/docs/Dialects/PDLOps/). 
+ ### Other functionality Dialect functionality other than IR objects or passes, such as helper functions, diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h index 3f8874d..1a33ecf 100644 --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -518,6 +518,10 @@ private: template <typename StateT> class SparseBackwardDataFlowAnalysis : public AbstractSparseBackwardDataFlowAnalysis { + static_assert( + std::is_base_of<AbstractSparseLattice, StateT>::value, + "analysis state class expected to subclass AbstractSparseLattice"); + public: explicit SparseBackwardDataFlowAnalysis(DataFlowSolver &solver, SymbolTableCollection &symbolTable) diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 47685567..5ddb3fb 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -197,7 +197,12 @@ public: MlirPatternRewriter rewriter, void *userData) -> MlirLogicalResult { nb::handle f(static_cast<PyObject *>(userData)); - nb::object res = f(op, PyPatternRewriter(rewriter)); + + PyMlirContextRef ctx = + PyMlirContext::forContext(mlirOperationGetContext(op)); + nb::object opView = PyOperation::forOperation(ctx, op)->createOpView(); + + nb::object res = f(opView, PyPatternRewriter(rewriter)); return logicalResultFromObject(res); }; MlirRewritePattern pattern = mlirOpRewritePattenCreate( diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 5fe5f41..1243511 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -357,11 +357,6 @@ static bool shouldBeInlined(ExpressionOp expressionOp) { if (expressionOp.getDoNotInline()) return false; - // Do not inline expressions with side effects to prevent side-effect - // reordering. 
- if (expressionOp.hasSideEffects()) - return false; - // Do not inline expressions with multiple uses. Value result = expressionOp.getResult(); if (!result.hasOneUse()) @@ -377,7 +372,34 @@ static bool shouldBeInlined(ExpressionOp expressionOp) { // Do not inline expressions used by other expressions or by ops with the // CExpressionInterface. If this was intended, the user could have been merged // into the expression op. - return !isa<emitc::ExpressionOp, emitc::CExpressionInterface>(*user); + if (isa<emitc::ExpressionOp, emitc::CExpressionInterface>(*user)) + return false; + + // Expressions with no side-effects can safely be inlined. + if (!expressionOp.hasSideEffects()) + return true; + + // Expressions with side-effects can be only inlined if side-effect ordering + // in the program is provably retained. + + // Require the user to immediately follow the expression. + if (++Block::iterator(expressionOp) != Block::iterator(user)) + return false; + + // These single-operand ops are safe. + if (isa<emitc::IfOp, emitc::SwitchOp, emitc::ReturnOp>(user)) + return true; + + // For assignment look for specific cases to inline as evaluation order of + // its lvalue and rvalue is undefined in C. + if (auto assignOp = dyn_cast<emitc::AssignOp>(user)) { + // Inline if this assignment is of the form `<var> = <expression>`. 
+ if (expressionOp.getResult() == assignOp.getValue() && + isa_and_present<VariableOp>(assignOp.getVar().getDefiningOp())) + return true; + } + + return false; } static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation, diff --git a/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp b/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp index e3f075f..8ecb084 100644 --- a/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp +++ b/mlir/lib/Target/IRDLToCpp/IRDLToCpp.cpp @@ -464,12 +464,6 @@ static std::string generateOpDefinition(irdl::detail::dictionary &dict, auto opStrings = getStrings(op); fillDict(dict, opStrings); - const auto operandCount = opStrings.opOperandNames.size(); - const auto operandNames = - operandCount ? joinNameList(opStrings.opOperandNames) : "{\"\"}"; - - const auto resultNames = joinNameList(opStrings.opResultNames); - auto resultTypes = llvm::join( llvm::map_range(opStrings.opResultNames, [](StringRef attr) -> std::string { diff --git a/mlir/python/mlir/dialects/arith.py b/mlir/python/mlir/dialects/arith.py index 92da5df..88e8502 100644 --- a/mlir/python/mlir/dialects/arith.py +++ b/mlir/python/mlir/dialects/arith.py @@ -92,7 +92,7 @@ class ConstantOp(ConstantOp): @property def value(self): - return Attribute(self.operation.attributes["value"]) + return self.operation.attributes["value"] @property def literal_value(self) -> Union[int, float]: diff --git a/mlir/test/Target/Cpp/expressions.mlir b/mlir/test/Target/Cpp/expressions.mlir index 4281f41..9f1c816 100644 --- a/mlir/test/Target/Cpp/expressions.mlir +++ b/mlir/test/Target/Cpp/expressions.mlir @@ -315,16 +315,13 @@ func.func @different_expressions(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) } // CPP-DEFAULT: int32_t expression_with_dereference(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { -// CPP-DEFAULT-NEXT: int32_t [[VAL_3:v[0-9]+]] = *([[VAL_2]] - [[VAL_1]]); -// CPP-DEFAULT-NEXT: return [[VAL_3]]; +// CPP-DEFAULT-NEXT: return *([[VAL_2]] - [[VAL_1]]); // CPP-DEFAULT-NEXT: } // CPP-DECLTOP: 
int32_t expression_with_dereference(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2]]) { -// CPP-DECLTOP-NEXT: int32_t [[VAL_3:v[0-9]+]]; -// CPP-DECLTOP-NEXT: [[VAL_3]] = *([[VAL_2]] - [[VAL_1]]); -// CPP-DECLTOP-NEXT: return [[VAL_3]]; +// CPP-DECLTOP-NEXT: return *([[VAL_2]] - [[VAL_1]]); // CPP-DECLTOP-NEXT: } -func.func @expression_with_dereference(%arg1: i32, %arg2: !emitc.ptr<i32>) -> i32 { +emitc.func @expression_with_dereference(%arg1: i32, %arg2: !emitc.ptr<i32>) -> i32 { %c = emitc.expression %arg1, %arg2 : (i32, !emitc.ptr<i32>) -> i32 { %e = emitc.sub %arg2, %arg1 : (!emitc.ptr<i32>, i32) -> !emitc.ptr<i32> %d = emitc.apply "*"(%e) : (!emitc.ptr<i32>) -> i32 @@ -384,19 +381,16 @@ func.func @expression_with_subscript_user(%arg0: !emitc.ptr<!emitc.opaque<"void" // CPP-DEFAULT: bool expression_with_load(int32_t [[VAL_1:v.+]], int32_t [[VAL_2:v.+]], int32_t* [[VAL_3:v.+]]) { // CPP-DEFAULT-NEXT: int64_t [[VAL_4:v.+]] = 0; // CPP-DEFAULT-NEXT: int32_t [[VAL_5:v.+]] = 42; -// CPP-DEFAULT-NEXT: bool [[VAL_6:v.+]] = [[VAL_5]] + [[VAL_2]] < [[VAL_3]][[[VAL_4]]] + [[VAL_1]]; -// CPP-DEFAULT-NEXT: return [[VAL_6]]; +// CPP-DEFAULT-NEXT: return [[VAL_5]] + [[VAL_2]] < [[VAL_3]][[[VAL_4]]] + [[VAL_1]]; // CPP-DECLTOP: bool expression_with_load(int32_t [[VAL_1:v.+]], int32_t [[VAL_2:v.+]], int32_t* [[VAL_3:v.+]]) { // CPP-DECLTOP-NEXT: int64_t [[VAL_4:v.+]]; // CPP-DECLTOP-NEXT: int32_t [[VAL_5:v.+]]; -// CPP-DECLTOP-NEXT: bool [[VAL_6:v.+]]; // CPP-DECLTOP-NEXT: [[VAL_4]] = 0; // CPP-DECLTOP-NEXT: [[VAL_5]] = 42; -// CPP-DECLTOP-NEXT: [[VAL_6]] = [[VAL_5]] + [[VAL_2]] < [[VAL_3]][[[VAL_4]]] + [[VAL_1]]; -// CPP-DECLTOP-NEXT: return [[VAL_6]]; +// CPP-DECLTOP-NEXT: return [[VAL_5]] + [[VAL_2]] < [[VAL_3]][[[VAL_4]]] + [[VAL_1]]; -func.func @expression_with_load(%arg0: i32, %arg1: i32, %arg2: !emitc.ptr<i32>) -> i1 { +emitc.func @expression_with_load(%arg0: i32, %arg1: i32, %arg2: !emitc.ptr<i32>) -> i1 { %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64 %0 = 
"emitc.variable"() <{value = #emitc.opaque<"42">}> : () -> !emitc.lvalue<i32> %ptr = emitc.subscript %arg2[%c0] : (!emitc.ptr<i32>, i64) -> !emitc.lvalue<i32> @@ -408,22 +402,19 @@ func.func @expression_with_load(%arg0: i32, %arg1: i32, %arg2: !emitc.ptr<i32>) %e = emitc.cmp lt, %b, %d :(i32, i32) -> i1 yield %e : i1 } - return %result : i1 + emitc.return %result : i1 } // CPP-DEFAULT: bool expression_with_load_and_call(int32_t* [[VAL_1:v.+]]) { // CPP-DEFAULT-NEXT: int64_t [[VAL_2:v.+]] = 0; -// CPP-DEFAULT-NEXT: bool [[VAL_3:v.+]] = [[VAL_1]][[[VAL_2]]] + bar([[VAL_1]][[[VAL_2]]]) < [[VAL_1]][[[VAL_2]]]; -// CPP-DEFAULT-NEXT: return [[VAL_3]]; +// CPP-DEFAULT-NEXT: return [[VAL_1]][[[VAL_2]]] + bar([[VAL_1]][[[VAL_2]]]) < [[VAL_1]][[[VAL_2]]]; // CPP-DECLTOP: bool expression_with_load_and_call(int32_t* [[VAL_1:v.+]]) { // CPP-DECLTOP-NEXT: int64_t [[VAL_2:v.+]]; -// CPP-DECLTOP-NEXT: bool [[VAL_3:v.+]]; // CPP-DECLTOP-NEXT: [[VAL_2]] = 0; -// CPP-DECLTOP-NEXT: [[VAL_3]] = [[VAL_1]][[[VAL_2]]] + bar([[VAL_1]][[[VAL_2]]]) < [[VAL_1]][[[VAL_2]]]; -// CPP-DECLTOP-NEXT: return [[VAL_3]]; +// CPP-DECLTOP-NEXT: return [[VAL_1]][[[VAL_2]]] + bar([[VAL_1]][[[VAL_2]]]) < [[VAL_1]][[[VAL_2]]]; -func.func @expression_with_load_and_call(%arg0: !emitc.ptr<i32>) -> i1 { +emitc.func @expression_with_load_and_call(%arg0: !emitc.ptr<i32>) -> i1 { %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64 %ptr = emitc.subscript %arg0[%c0] : (!emitc.ptr<i32>, i64) -> !emitc.lvalue<i32> %result = emitc.expression %ptr : (!emitc.lvalue<i32>) -> i1 { @@ -435,7 +426,7 @@ func.func @expression_with_load_and_call(%arg0: !emitc.ptr<i32>) -> i1 { %f = emitc.cmp lt, %e, %b :(i32, i32) -> i1 yield %f : i1 } - return %result : i1 + emitc.return %result : i1 } @@ -458,3 +449,204 @@ emitc.func @expression_with_call_opaque_with_args_array(%0 : i32, %1 : i32) { } return } + +// CPP-DEFAULT: void inline_side_effects_into_assign(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// 
CPP-DEFAULT-NEXT: int64_t [[VAL_3:v[0-9]+]] = 0; +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = 42; +// CPP-DEFAULT-NEXT: [[VAL_4]] = [[VAL_4]] * [[VAL_1]] + [[VAL_2]][[[VAL_3]]]; +// CPP-DEFAULT-NEXT: return; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: void inline_side_effects_into_assign(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: int64_t [[VAL_3:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: [[VAL_3]] = 0; +// CPP-DECLTOP-NEXT: [[VAL_4]] = 42; +// CPP-DECLTOP-NEXT: [[VAL_4]] = [[VAL_4]] * [[VAL_1]] + [[VAL_2]][[[VAL_3]]]; +// CPP-DECLTOP-NEXT: return; +// CPP-DECLTOP-NEXT: } + +emitc.func @inline_side_effects_into_assign(%arg0: i32, %arg1: !emitc.ptr<i32>) { + %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64 + %0 = "emitc.variable"() <{value = #emitc.opaque<"42">}> : () -> !emitc.lvalue<i32> + %ptr = emitc.subscript %arg1[%c0] : (!emitc.ptr<i32>, i64) -> !emitc.lvalue<i32> + %result = emitc.expression %arg0, %0, %ptr : (i32, !emitc.lvalue<i32>, !emitc.lvalue<i32>) -> i32 { + %a = emitc.load %0 : !emitc.lvalue<i32> + %b = emitc.mul %a, %arg0 : (i32, i32) -> i32 + %c = emitc.load %ptr : !emitc.lvalue<i32> + %d = emitc.add %b, %c : (i32, i32) -> i32 + yield %d : i32 + } + emitc.assign %result : i32 to %0 : !emitc.lvalue<i32> + emitc.return +} + +// CPP-DEFAULT: void do_not_inline_side_effects_into_assign(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// CPP-DEFAULT-NEXT: int64_t [[VAL_3:v[0-9]+]] = 0; +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = 42; +// CPP-DEFAULT-NEXT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] * [[VAL_1]]; +// CPP-DEFAULT-NEXT: [[VAL_2]][[[VAL_3]]] = [[VAL_5]]; +// CPP-DEFAULT-NEXT: return; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: void do_not_inline_side_effects_into_assign(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: int64_t [[VAL_3:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t 
[[VAL_5:v[0-9]+]]; +// CPP-DECLTOP-NEXT: [[VAL_3]] = 0; +// CPP-DECLTOP-NEXT: [[VAL_4]] = 42; +// CPP-DECLTOP-NEXT: [[VAL_5:v[0-9]+]] = [[VAL_4]] * [[VAL_1]]; +// CPP-DECLTOP-NEXT: [[VAL_2]][[[VAL_3]]] = [[VAL_5]]; +// CPP-DECLTOP-NEXT: return; +// CPP-DECLTOP-NEXT: } + +emitc.func @do_not_inline_side_effects_into_assign(%arg0: i32, %arg1: !emitc.ptr<i32>) { + %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64 + %0 = "emitc.variable"() <{value = #emitc.opaque<"42">}> : () -> !emitc.lvalue<i32> + %ptr = emitc.subscript %arg1[%c0] : (!emitc.ptr<i32>, i64) -> !emitc.lvalue<i32> + %result = emitc.expression %arg0, %0 : (i32, !emitc.lvalue<i32>) -> i32 { + %a = emitc.load %0 : !emitc.lvalue<i32> + %b = emitc.mul %a, %arg0 : (i32, i32) -> i32 + yield %b : i32 + } + emitc.assign %result : i32 to %ptr : !emitc.lvalue<i32> + emitc.return +} + +// CPP-DEFAULT: int32_t do_not_inline_non_preceding_side_effects(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// CPP-DEFAULT-NEXT: int64_t [[VAL_3:v[0-9]+]] = 0; +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = 42; +// CPP-DEFAULT-NEXT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] * [[VAL_1]]; +// CPP-DEFAULT-NEXT: [[VAL_2]][[[VAL_3]]] = [[VAL_1]]; +// CPP-DEFAULT-NEXT: return [[VAL_5]]; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: int32_t do_not_inline_non_preceding_side_effects(int32_t [[VAL_1:v[0-9]+]], int32_t* [[VAL_2:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: int64_t [[VAL_3:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_5:v[0-9]+]]; +// CPP-DECLTOP-NEXT: [[VAL_3:v[0-9]+]] = 0; +// CPP-DECLTOP-NEXT: [[VAL_4:v[0-9]+]] = 42; +// CPP-DECLTOP-NEXT: [[VAL_5:v[0-9]+]] = [[VAL_4]] * [[VAL_1]]; +// CPP-DECLTOP-NEXT: [[VAL_2]][[[VAL_3]]] = [[VAL_1]]; +// CPP-DECLTOP-NEXT: return [[VAL_5]]; +// CPP-DECLTOP-NEXT: } + +emitc.func @do_not_inline_non_preceding_side_effects(%arg0: i32, %arg1: !emitc.ptr<i32>) -> i32 { + %c0 = "emitc.constant"() {value = 0 : i64} : () -> i64 + %0 = 
"emitc.variable"() <{value = #emitc.opaque<"42">}> : () -> !emitc.lvalue<i32> + %ptr = emitc.subscript %arg1[%c0] : (!emitc.ptr<i32>, i64) -> !emitc.lvalue<i32> + %result = emitc.expression %arg0, %0 : (i32, !emitc.lvalue<i32>) -> i32 { + %a = emitc.load %0 : !emitc.lvalue<i32> + %b = emitc.mul %a, %arg0 : (i32, i32) -> i32 + yield %b : i32 + } + emitc.assign %arg0 : i32 to %ptr : !emitc.lvalue<i32> + emitc.return %result : i32 +} + +// CPP-DEFAULT: int32_t inline_side_effects_into_if(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DEFAULT-NEXT: if (bar([[VAL_1]], [[VAL_2]]) < [[VAL_3]]) { +// CPP-DEFAULT-NEXT: [[VAL_4]] = [[VAL_1]]; +// CPP-DEFAULT-NEXT: } else { +// CPP-DEFAULT-NEXT: [[VAL_4]] = [[VAL_2]]; +// CPP-DEFAULT-NEXT: } +// CPP-DEFAULT-NEXT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]]; +// CPP-DEFAULT-NEXT: return [[VAL_5]]; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: int32_t inline_side_effects_into_if(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_5:v[0-9]+]]; +// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: if (bar([[VAL_1]], [[VAL_2]]) < [[VAL_3]]) { +// CPP-DECLTOP-NEXT: [[VAL_4]] = [[VAL_1]]; +// CPP-DECLTOP-NEXT: } else { +// CPP-DECLTOP-NEXT: [[VAL_4]] = [[VAL_2]]; +// CPP-DECLTOP-NEXT: } +// CPP-DECLTOP-NEXT: [[VAL_5]] = [[VAL_4]]; +// CPP-DECLTOP-NEXT: return [[VAL_5]]; +// CPP-DECLTOP-NEXT: } + +func.func @inline_side_effects_into_if(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 { + %v = "emitc.variable"(){value = #emitc.opaque<"">} : () -> !emitc.lvalue<i32> + %cond = emitc.expression %arg0, %arg1, %arg2 : (i32, i32, i32) -> i1 { + %a = emitc.call_opaque "bar" (%arg0, %arg1) : (i32, i32) -> (i32) + %b = emitc.cmp lt, %a, %arg2 :(i32, i32) -> i1 + emitc.yield %b : i1 + } + emitc.if %cond { + emitc.assign %arg0 : i32 to %v : !emitc.lvalue<i32> + 
emitc.yield + } else { + emitc.assign %arg1 : i32 to %v : !emitc.lvalue<i32> + emitc.yield + } + %v_load = emitc.load %v : !emitc.lvalue<i32> + return %v_load : i32 +} + +// CPP-DEFAULT: void inline_side_effects_into_switch(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DEFAULT-NEXT: switch (bar([[VAL_1]], [[VAL_2]]) + [[VAL_3]]) { +// CPP-DEFAULT-NEXT: case 2: { +// CPP-DEFAULT-NEXT: int32_t [[VAL_4:v[0-9]+]] = func_b(); +// CPP-DEFAULT-NEXT: break; +// CPP-DEFAULT-NEXT: } +// CPP-DEFAULT-NEXT: case 5: { +// CPP-DEFAULT-NEXT: int32_t [[VAL_5:v[0-9]+]] = func_a(); +// CPP-DEFAULT-NEXT: break; +// CPP-DEFAULT-NEXT: } +// CPP-DEFAULT-NEXT: default: { +// CPP-DEFAULT-NEXT: float [[VAL_6:v[0-9]+]] = 4.200000000e+01f; +// CPP-DEFAULT-NEXT: func2([[VAL_6]]); +// CPP-DEFAULT-NEXT: break; +// CPP-DEFAULT-NEXT: } +// CPP-DEFAULT-NEXT: } +// CPP-DEFAULT-NEXT: return; +// CPP-DEFAULT-NEXT: } + +// CPP-DECLTOP: void inline_side_effects_into_switch(int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]], int32_t [[VAL_3:v[0-9]+]]) { +// CPP-DECLTOP-NEXT: float [[VAL_6:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_4:v[0-9]+]]; +// CPP-DECLTOP-NEXT: int32_t [[VAL_5:v[0-9]+]]; +// CPP-DECLTOP-NEXT: switch (bar([[VAL_1]], [[VAL_2]]) + [[VAL_3]]) { +// CPP-DECLTOP-NEXT: case 2: { +// CPP-DECLTOP-NEXT: [[VAL_4]] = func_b(); +// CPP-DECLTOP-NEXT: break; +// CPP-DECLTOP-NEXT: } +// CPP-DECLTOP-NEXT: case 5: { +// CPP-DECLTOP-NEXT: [[VAL_5]] = func_a(); +// CPP-DECLTOP-NEXT: break; +// CPP-DECLTOP-NEXT: } +// CPP-DECLTOP-NEXT: default: { +// CPP-DECLTOP-NEXT: [[VAL_6]] = 4.200000000e+01f; +// CPP-DECLTOP-NEXT: func2([[VAL_6]]); +// CPP-DECLTOP-NEXT: break; +// CPP-DECLTOP-NEXT: } +// CPP-DECLTOP-NEXT: } +// CPP-DECLTOP-NEXT: return; +// CPP-DECLTOP-NEXT: } + +func.func @inline_side_effects_into_switch(%arg0: i32, %arg1: i32, %arg2: i32) { + %0 = emitc.expression %arg0, %arg1, %arg2 : (i32, i32, i32) -> i32 { + %a = emitc.call_opaque "bar" 
(%arg0, %arg1) : (i32, i32) -> (i32) + %b = emitc.add %a, %arg2 :(i32, i32) -> i32 + emitc.yield %b : i32 + } + emitc.switch %0 : i32 + case 2 { + %1 = emitc.call_opaque "func_b" () : () -> i32 + emitc.yield + } + case 5 { + %2 = emitc.call_opaque "func_a" () : () -> i32 + emitc.yield + } + default { + %3 = "emitc.constant"(){value = 42.0 : f32} : () -> f32 + emitc.call_opaque "func2" (%3) : (f32) -> () + emitc.yield + } + return +} diff --git a/mlir/test/python/rewrite.py b/mlir/test/python/rewrite.py index acf7db2..821e470 100644 --- a/mlir/test/python/rewrite.py +++ b/mlir/test/python/rewrite.py @@ -17,15 +17,16 @@ def run(f): def testRewritePattern(): def to_muli(op, rewriter): with rewriter.ip: - new_op = arith.muli(op.operands[0], op.operands[1], loc=op.location) + assert isinstance(op, arith.AddIOp) + new_op = arith.muli(op.lhs, op.rhs, loc=op.location) rewriter.replace_op(op, new_op.owner) def constant_1_to_2(op, rewriter): - c = op.attributes["value"].value + c = op.value.value if c != 1: return True # failed to match with rewriter.ip: - new_op = arith.constant(op.result.type, 2, loc=op.location) + new_op = arith.constant(op.type, 2, loc=op.location) rewriter.replace_op(op, [new_op]) with Context(): diff --git a/mlir/tools/mlir-tblgen/CppGenUtilities.cpp b/mlir/tools/mlir-tblgen/CppGenUtilities.cpp index fddd779..7cead35 100644 --- a/mlir/tools/mlir-tblgen/CppGenUtilities.cpp +++ b/mlir/tools/mlir-tblgen/CppGenUtilities.cpp @@ -18,8 +18,6 @@ void mlir::tblgen::emitSummaryAndDescComments(llvm::raw_ostream &os, llvm::StringRef summary, llvm::StringRef description, bool terminateComment) { - - std::string comments = ""; StringRef trimmedSummary = summary.trim(); StringRef trimmedDesc = description.trim(); raw_indented_ostream ros(os); diff --git a/mlir/tools/mlir-tblgen/PassGen.cpp b/mlir/tools/mlir-tblgen/PassGen.cpp index 4b4ac41..f7134ce 100644 --- a/mlir/tools/mlir-tblgen/PassGen.cpp +++ b/mlir/tools/mlir-tblgen/PassGen.cpp @@ -195,7 +195,7 @@ public: } 
::llvm::StringRef getArgument() const override { return "{2}"; } - ::llvm::StringRef getDescription() const override { return "{3}"; } + ::llvm::StringRef getDescription() const override { return R"PD({3})PD"; } /// Returns the derived pass name. static constexpr ::llvm::StringLiteral getPassName() { @@ -271,9 +271,9 @@ static void emitPassOptionDecls(const Pass &pass, raw_ostream &os) { os.indent(2) << "::mlir::Pass::" << (opt.isListOption() ? "ListOption" : "Option"); - os << formatv(R"(<{0}> {1}{{*this, "{2}", ::llvm::cl::desc("{3}"))", + os << formatv(R"(<{0}> {1}{{*this, "{2}", ::llvm::cl::desc(R"PO({3})PO"))", opt.getType(), opt.getCppVariableName(), opt.getArgument(), - opt.getDescription()); + opt.getDescription().trim()); if (std::optional<StringRef> defaultVal = opt.getDefaultValue()) os << ", ::llvm::cl::init(" << defaultVal << ")"; if (std::optional<StringRef> additionalFlags = opt.getAdditionalFlags()) @@ -285,9 +285,10 @@ static void emitPassOptionDecls(const Pass &pass, raw_ostream &os) { /// Emit the declarations for each of the pass statistics. 
static void emitPassStatisticDecls(const Pass &pass, raw_ostream &os) { for (const PassStatistic &stat : pass.getStatistics()) { - os << formatv(" ::mlir::Pass::Statistic {0}{{this, \"{1}\", \"{2}\"};\n", - stat.getCppVariableName(), stat.getName(), - stat.getDescription()); + os << formatv( + " ::mlir::Pass::Statistic {0}{{this, \"{1}\", R\"PS({2})PS\"};\n", + stat.getCppVariableName(), stat.getName(), + stat.getDescription().trim()); } } @@ -320,7 +321,7 @@ static void emitPassDefs(const Pass &pass, raw_ostream &os) { os << "namespace impl {\n"; os << formatv(baseClassBegin, passName, pass.getBaseClass(), - pass.getArgument(), pass.getSummary(), + pass.getArgument(), pass.getSummary().trim(), dependentDialectRegistrations); if (ArrayRef<PassOption> options = pass.getOptions(); !options.empty()) { @@ -393,7 +394,7 @@ public: } ::llvm::StringRef getArgument() const override { return "{2}"; } - ::llvm::StringRef getDescription() const override { return "{3}"; } + ::llvm::StringRef getDescription() const override { return R"PD({3})PD"; } /// Returns the derived pass name. 
static constexpr ::llvm::StringLiteral getPassName() { @@ -439,7 +440,7 @@ static void emitOldPassDecl(const Pass &pass, raw_ostream &os) { "\n "); } os << formatv(oldPassDeclBegin, defName, pass.getBaseClass(), - pass.getArgument(), pass.getSummary(), + pass.getArgument(), pass.getSummary().trim(), dependentDialectRegistrations); emitPassOptionDecls(pass, os); emitPassStatisticDecls(pass, os); diff --git a/mlir/unittests/IR/RemarkTest.cpp b/mlir/unittests/IR/RemarkTest.cpp index 5bfca25..bcbda90 100644 --- a/mlir/unittests/IR/RemarkTest.cpp +++ b/mlir/unittests/IR/RemarkTest.cpp @@ -149,7 +149,6 @@ TEST(Remark, TestNoOutputOptimizationRemark) { std::string categoryFailName("myImportantCategory"); std::string myPassname1("myPass1"); - std::string funcName("myFunc"); SmallString<64> tmpPathStorage; sys::fs::createUniquePath("remarks-%%%%%%.yaml", tmpPathStorage, /*MakeAbsolute=*/true); @@ -271,9 +270,6 @@ TEST(Remark, TestCustomOptimizationRemarkDiagnostic) { std::string categoryInline("Inliner"); std::string myPassname1("myPass1"); std::string myPassname2("myPass2"); - std::string funcName("myFunc"); - - std::string seenMsg = ""; { MLIRContext context; diff --git a/mlir/unittests/TableGen/passes.td b/mlir/unittests/TableGen/passes.td index 5e53cb9..79c57a9 100644 --- a/mlir/unittests/TableGen/passes.td +++ b/mlir/unittests/TableGen/passes.td @@ -19,8 +19,11 @@ def TestPassWithOptions : Pass<"test"> { let options = [ Option<"testOption", "testOption", "int", "0", "Test option">, - ListOption<"testListOption", "test-list-option", "int64_t", - "Test list option"> + // Testing the output of multi-line description. This would fail compilation + // if not properly handled. 
+ ListOption<"testListOption", "test-list-option", "int64_t", [{ + Test + list option}]> ]; } diff --git a/openmp/runtime/src/z_AIX_asm.S b/openmp/runtime/src/z_AIX_asm.S index d711fcb..4352079 100644 --- a/openmp/runtime/src/z_AIX_asm.S +++ b/openmp/runtime/src/z_AIX_asm.S @@ -367,7 +367,7 @@ .vbyte 4, 0x00000000 # Traceback table begin .byte 0x00 # Version = 0 .byte 0x09 # Language = CPlusPlus - .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue + .byte 0x20 # -IsGlobalLinkage, -IsOutOfLineEpilogOrPrologue # +HasTraceBackTableOffset, -IsInternalProcedure # -HasControlledStorage, -IsTOCless # -IsFloatingPointPresent diff --git a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h index 0e516ee..c0d11e9 100644 --- a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h +++ b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h @@ -31,13 +31,13 @@ namespace orc_rt { /// /// Intances can: /// 1. Reserve address space. -/// 2. Finalize memory regions within reserved memory (copying content, +/// 2. Initialize memory regions within reserved memory (copying content, /// applying permissions, running finalize actions, and recording /// deallocate actions). -/// 3. Deallocate memory regions within reserved memory (running +/// 3. Deinitialize memory regions within reserved memory (running /// deallocate actions and making memory available for future -/// finalize calls (if the system permits this). -/// 4. Release address space, deallocating any not-yet-deallocated finalized +/// initialize calls (if the system permits this). +/// 4. Release address space, deinitializing any remaining initialized /// regions, and returning the address space to the system for reuse (if /// the system permits). 
class SimpleNativeMemoryMap : public ResourceManager { @@ -58,7 +58,7 @@ public: void releaseMultiple(OnReleaseCompleteFn &&OnComplete, std::vector<void *> Addrs); - struct FinalizeRequest { + struct InitializeRequest { struct Segment { AllocGroup AG; char *Address = nullptr; @@ -72,19 +72,19 @@ public: /// Writes content into the requested ranges, applies permissions, and /// performs allocation actions. - using OnFinalizeCompleteFn = move_only_function<void(Expected<void *>)>; - void finalize(OnFinalizeCompleteFn &&OnComplete, FinalizeRequest FR); + using OnInitializeCompleteFn = move_only_function<void(Expected<void *>)>; + void initialize(OnInitializeCompleteFn &&OnComplete, InitializeRequest FR); /// Runs deallocation actions and resets memory permissions for the requested /// memory. - using OnDeallocateCompleteFn = move_only_function<void(Error)>; - void deallocate(OnDeallocateCompleteFn &&OnComplete, void *Base); + using OnDeinitializeCompleteFn = move_only_function<void(Error)>; + void deinitialize(OnDeinitializeCompleteFn &&OnComplete, void *Base); - /// Convenience method to deallocate multiple regions with one call. This can - /// be used to save on interprocess communication at the cost of less + /// Convenience method to deinitialize multiple regions with one call. This + /// can be used to save on interprocess communication at the cost of less /// expressive errors. 
- void deallocateMultiple(OnDeallocateCompleteFn &&OnComplete, - std::vector<void *> Bases); + void deinitializeMultiple(OnDeinitializeCompleteFn &&OnComplete, + std::vector<void *> Bases); void detach(ResourceManager::OnCompleteFn OnComplete) override; void shutdown(ResourceManager::OnCompleteFn OnComplete) override; @@ -98,8 +98,9 @@ private: void releaseNext(OnReleaseCompleteFn &&OnComplete, std::vector<void *> Addrs, bool AnyError, Error LastErr); - void deallocateNext(OnDeallocateCompleteFn &&OnComplete, - std::vector<void *> Bases, bool AnyError, Error LastErr); + void deinitializeNext(OnDeinitializeCompleteFn &&OnComplete, + std::vector<void *> Bases, bool AnyError, + Error LastErr); void shutdownNext(OnCompleteFn OnComplete, std::vector<void *> Bases); Error makeBadSlabError(void *Base, const char *Op); SlabInfo *findSlabInfoFor(void *Base); @@ -121,12 +122,12 @@ orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper( orc_rt_SessionRef Session, void *CallCtx, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); -ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_finalize_sps_wrapper( +ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper( orc_rt_SessionRef Session, void *CallCtx, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); ORC_RT_SPS_INTERFACE void -orc_rt_SimpleNativeMemoryMap_deallocateMultiple_sps_wrapper( +orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper( orc_rt_SessionRef Session, void *CallCtx, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); diff --git a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp index 987bd85..bce5c1da 100644 --- a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp +++ b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp @@ -8,8 +8,8 @@ // // SimpleNativeMemoryMap and related APIs. 
// -// TODO: We don't reset / uncommit pages on deallocate, or on failure during -// finalize. We should do that to reduce memory pressure. +// TODO: We don't reset / uncommit pages on deinitialize, or on failure during +// initialize. We should do that to reduce memory pressure. // //===----------------------------------------------------------------------===// @@ -29,14 +29,16 @@ namespace orc_rt { struct SPSSimpleNativeMemoryMapSegment; template <> -class SPSSerializationTraits<SPSSimpleNativeMemoryMapSegment, - SimpleNativeMemoryMap::FinalizeRequest::Segment> { +class SPSSerializationTraits< + SPSSimpleNativeMemoryMapSegment, + SimpleNativeMemoryMap::InitializeRequest::Segment> { using SPSType = SPSTuple<SPSAllocGroup, SPSExecutorAddr, uint64_t, SPSSequence<char>>; public: - static bool deserialize(SPSInputBuffer &IB, - SimpleNativeMemoryMap::FinalizeRequest::Segment &S) { + static bool + deserialize(SPSInputBuffer &IB, + SimpleNativeMemoryMap::InitializeRequest::Segment &S) { AllocGroup AG; ExecutorAddr Address; uint64_t Size; @@ -50,17 +52,17 @@ public: } }; -struct SPSSimpleNativeMemoryMapFinalizeRequest; +struct SPSSimpleNativeMemoryMapInitializeRequest; template <> -class SPSSerializationTraits<SPSSimpleNativeMemoryMapFinalizeRequest, - SimpleNativeMemoryMap::FinalizeRequest> { +class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest, + SimpleNativeMemoryMap::InitializeRequest> { using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>, SPSSequence<SPSAllocActionPair>>; public: static bool deserialize(SPSInputBuffer &IB, - SimpleNativeMemoryMap::FinalizeRequest &FR) { + SimpleNativeMemoryMap::InitializeRequest &FR) { return SPSType::AsArgList::deserialize(IB, FR.Segments, FR.AAPs); } }; @@ -121,13 +123,13 @@ void SimpleNativeMemoryMap::releaseMultiple(OnReleaseCompleteFn &&OnComplete, releaseNext(std::move(OnComplete), std::move(Addrs), false, Error::success()); } -void SimpleNativeMemoryMap::finalize(OnFinalizeCompleteFn 
&&OnComplete, - FinalizeRequest FR) { +void SimpleNativeMemoryMap::initialize(OnInitializeCompleteFn &&OnComplete, + InitializeRequest FR) { void *Base = nullptr; - // TODO: Record finalize segments for release. - // std::vector<std::pair<void*, size_t>> FinalizeSegments; + // TODO: Record initialize segments for release. + // std::vector<std::pair<void*, size_t>> InitializeSegments; // Check segment validity before proceeding. for (auto &S : FR.Segments) { @@ -166,9 +168,10 @@ void SimpleNativeMemoryMap::finalize(OnFinalizeCompleteFn &&OnComplete, } if (!Base) - return OnComplete(make_error<StringError>( - "SimpleNativeMemoryMap finalize error: finalization requires at least " - "one standard-lifetime segment")); + return OnComplete( + make_error<StringError>("SimpleNativeMemoryMap initialize error: " + "finalization requires at least " + "one standard-lifetime segment")); auto DeallocActions = runFinalizeActions(std::move(FR.AAPs)); if (!DeallocActions) @@ -182,8 +185,8 @@ void SimpleNativeMemoryMap::finalize(OnFinalizeCompleteFn &&OnComplete, OnComplete(Base); } -void SimpleNativeMemoryMap::deallocate(OnDeallocateCompleteFn &&OnComplete, - void *Base) { +void SimpleNativeMemoryMap::deinitialize(OnDeinitializeCompleteFn &&OnComplete, + void *Base) { std::vector<AllocAction> DAAs; { @@ -191,16 +194,17 @@ void SimpleNativeMemoryMap::deallocate(OnDeallocateCompleteFn &&OnComplete, auto *SI = findSlabInfoFor(Base); if (!SI) { Lock.unlock(); - return OnComplete(makeBadSlabError(Base, "finalize")); + return OnComplete(makeBadSlabError(Base, "deinitialize")); } auto I = SI->DeallocActions.find(Base); if (I == SI->DeallocActions.end()) { Lock.unlock(); std::ostringstream ErrMsg; - ErrMsg << "SimpleNativeMemoryMap deallocate error: no deallocate actions " - "registered for segment base address " - << Base; + ErrMsg + << "SimpleNativeMemoryMap deinitialize error: no deallocate actions " + "registered for segment base address " + << Base; return 
OnComplete(make_error<StringError>(ErrMsg.str())); } @@ -212,10 +216,10 @@ void SimpleNativeMemoryMap::deallocate(OnDeallocateCompleteFn &&OnComplete, OnComplete(Error::success()); } -void SimpleNativeMemoryMap::deallocateMultiple( - OnDeallocateCompleteFn &&OnComplete, std::vector<void *> Bases) { - deallocateNext(std::move(OnComplete), std::move(Bases), false, - Error::success()); +void SimpleNativeMemoryMap::deinitializeMultiple( + OnDeinitializeCompleteFn &&OnComplete, std::vector<void *> Bases) { + deinitializeNext(std::move(OnComplete), std::move(Bases), false, + Error::success()); } void SimpleNativeMemoryMap::detach(ResourceManager::OnCompleteFn OnComplete) { @@ -268,9 +272,9 @@ void SimpleNativeMemoryMap::releaseNext(OnReleaseCompleteFn &&OnComplete, NextAddr); } -void SimpleNativeMemoryMap::deallocateNext(OnDeallocateCompleteFn &&OnComplete, - std::vector<void *> Addrs, - bool AnyError, Error LastErr) { +void SimpleNativeMemoryMap::deinitializeNext( + OnDeinitializeCompleteFn &&OnComplete, std::vector<void *> Addrs, + bool AnyError, Error LastErr) { // TODO: Log error? 
if (LastErr) { consumeError(std::move(LastErr)); @@ -282,17 +286,17 @@ void SimpleNativeMemoryMap::deallocateNext(OnDeallocateCompleteFn &&OnComplete, return OnComplete(Error::success()); return OnComplete( - make_error<StringError>("Failed to deallocate some addresses")); + make_error<StringError>("Failed to deinitialize some addresses")); } void *NextAddr = Addrs.back(); Addrs.pop_back(); - deallocate( + deinitialize( [this, OnComplete = std::move(OnComplete), AnyError = AnyError, Addrs = std::move(Addrs)](Error Err) mutable { - deallocateNext(std::move(OnComplete), std::move(Addrs), AnyError, - std::move(Err)); + deinitializeNext(std::move(OnComplete), std::move(Addrs), AnyError, + std::move(Err)); }, NextAddr); } @@ -346,15 +350,15 @@ Error SimpleNativeMemoryMap::recordDeallocActions( auto *SI = findSlabInfoFor(Base); if (!SI) { Lock.unlock(); - return makeBadSlabError(Base, "deallocate"); + return makeBadSlabError(Base, "deinitialize"); } auto I = SI->DeallocActions.find(Base); if (I != SI->DeallocActions.end()) { Lock.unlock(); std::ostringstream ErrMsg; - ErrMsg << "SimpleNativeMemoryMap finalize error: segment base address " - "reused in subsequent finalize call"; + ErrMsg << "SimpleNativeMemoryMap initialize error: segment base address " + "reused in subsequent initialize call"; return make_error<StringError>(ErrMsg.str()); } @@ -383,19 +387,19 @@ orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper( &SimpleNativeMemoryMap::releaseMultiple)); } -ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_finalize_sps_wrapper( +ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper( orc_rt_SessionRef Session, void *CallCtx, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) { using Sig = SPSExpected<SPSExecutorAddr>( - SPSExecutorAddr, SPSSimpleNativeMemoryMapFinalizeRequest); - SPSWrapperFunction<Sig>::handle( - Session, CallCtx, Return, ArgBytes, - 
WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::finalize)); + SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest); + SPSWrapperFunction<Sig>::handle(Session, CallCtx, Return, ArgBytes, + WrapperFunction::handleWithAsyncMethod( + &SimpleNativeMemoryMap::initialize)); } ORC_RT_SPS_INTERFACE void -orc_rt_SimpleNativeMemoryMap_deallocateMultiple_sps_wrapper( +orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper( orc_rt_SessionRef Session, void *CallCtx, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) { @@ -403,7 +407,7 @@ orc_rt_SimpleNativeMemoryMap_deallocateMultiple_sps_wrapper( SPSWrapperFunction<Sig>::handle( Session, CallCtx, Return, ArgBytes, WrapperFunction::handleWithAsyncMethod( - &SimpleNativeMemoryMap::deallocateMultiple)); + &SimpleNativeMemoryMap::deinitializeMultiple)); } } // namespace orc_rt diff --git a/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp b/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp index c54d791..c793886 100644 --- a/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp +++ b/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp @@ -26,14 +26,14 @@ namespace orc_rt { struct SPSSimpleNativeMemoryMapSegment; -/// A SimpleNativeMemoryMap::FinalizeRequest::Segment plus segment content (if +/// A SimpleNativeMemoryMap::InitializeRequest::Segment plus segment content (if /// segment content type is regular). 
struct TestSNMMSegment - : public SimpleNativeMemoryMap::FinalizeRequest::Segment { + : public SimpleNativeMemoryMap::InitializeRequest::Segment { TestSNMMSegment(AllocGroup AG, char *Address, size_t Size, std::vector<char> C = {}) - : SimpleNativeMemoryMap::FinalizeRequest::Segment( + : SimpleNativeMemoryMap::InitializeRequest::Segment( {AG, Address, Size, {}}), OwnedContent(std::move(C)) { this->Content = {OwnedContent.data(), OwnedContent.size()}; @@ -60,25 +60,25 @@ public: } }; -struct SPSSimpleNativeMemoryMapFinalizeRequest; +struct SPSSimpleNativeMemoryMapInitializeRequest; -struct TestSNMMFinalizeRequest { +struct TestSNMMInitializeRequest { std::vector<TestSNMMSegment> Segments; std::vector<AllocActionPair> AAPs; }; template <> -class SPSSerializationTraits<SPSSimpleNativeMemoryMapFinalizeRequest, - TestSNMMFinalizeRequest> { +class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest, + TestSNMMInitializeRequest> { using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>, SPSSequence<SPSAllocActionPair>>; public: - static size_t size(const TestSNMMFinalizeRequest &FR) { + static size_t size(const TestSNMMInitializeRequest &FR) { return SPSType::AsArgList::size(FR.Segments, FR.AAPs); } static bool serialize(SPSOutputBuffer &OB, - const TestSNMMFinalizeRequest &FR) { + const TestSNMMInitializeRequest &FR) { return SPSType::AsArgList::serialize(OB, FR.Segments, FR.AAPs); } }; @@ -118,24 +118,26 @@ static void snmm_releaseMultiple(OnCompleteFn &&OnComplete, } template <typename OnCompleteFn> -static void snmm_finalize(OnCompleteFn &&OnComplete, - SimpleNativeMemoryMap *Instance, - TestSNMMFinalizeRequest FR) { +static void snmm_initialize(OnCompleteFn &&OnComplete, + SimpleNativeMemoryMap *Instance, + TestSNMMInitializeRequest FR) { using SPSSig = SPSExpected<SPSExecutorAddr>( - SPSExecutorAddr, SPSSimpleNativeMemoryMapFinalizeRequest); + SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest); SPSWrapperFunction<SPSSig>::call( - 
DirectCaller(nullptr, orc_rt_SimpleNativeMemoryMap_finalize_sps_wrapper), + DirectCaller(nullptr, + orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper), std::forward<OnCompleteFn>(OnComplete), Instance, std::move(FR)); } template <typename OnCompleteFn> -static void snmm_deallocateMultiple(OnCompleteFn &&OnComplete, - SimpleNativeMemoryMap *Instance, - span<void *> Base) { +static void snmm_deinitializeMultiple(OnCompleteFn &&OnComplete, + SimpleNativeMemoryMap *Instance, + span<void *> Base) { using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>); SPSWrapperFunction<SPSSig>::call( - DirectCaller(nullptr, - orc_rt_SimpleNativeMemoryMap_deallocateMultiple_sps_wrapper), + DirectCaller( + nullptr, + orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper), std::forward<OnCompleteFn>(OnComplete), Instance, Base); } @@ -180,9 +182,9 @@ read_value_sps_allocaction(const char *ArgData, size_t ArgSize) { TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) { // Test that we can: // 1. reserve some address space. - // 2. finalize a range within it as read/write, and that finalize actions + // 2. initialize a range within it as read/write, and that finalize actions // are applied as expected. - // 3. deallocate the finalized range, with deallocation actions applied as + // 3. deinitialize the initialized range, with deallocation actions applied as // expected. // 4. release the address range. @@ -191,12 +193,13 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) { snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024); void *Addr = cantFail(cantFail(ReserveAddr.get())); - std::future<Expected<Expected<void *>>> FinalizeKey; - TestSNMMFinalizeRequest FR; - char *FinalizeBase = // Finalize addr at non-zero (64kb) offset from base. + std::future<Expected<Expected<void *>>> InitializeKey; + TestSNMMInitializeRequest FR; + char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base. 
reinterpret_cast<char *>(Addr) + 64 * 1024; uint64_t SentinelValue1 = 0; // Read from pre-filled content - uint64_t SentinelValue2 = 0; // Written in finalize, read back during dealloc. + uint64_t SentinelValue2 = + 0; // Written in initialize, read back during dealloc. uint64_t SentinelValue3 = 42; // Read from zero-filled region. // Build initial content vector. @@ -205,14 +208,14 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) { memcpy(Content.data(), &SentinelValue3, sizeof(uint64_t)); memcpy(Content.data() + sizeof(uint64_t), &SentinelValue1, sizeof(uint64_t)); - FR.Segments.push_back({MemProt::Read | MemProt::Write, FinalizeBase, + FR.Segments.push_back({MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024, std::move(Content)}); // Read initial content into Sentinel 1. FR.AAPs.push_back({ *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from( read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue1), - ExecutorAddr::fromPtr(FinalizeBase)), + ExecutorAddr::fromPtr(InitializeBase)), {} // No dealloc action. }); @@ -220,30 +223,30 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) { FR.AAPs.push_back( {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from( write_value_sps_allocaction, - ExecutorAddr::fromPtr(FinalizeBase) + sizeof(uint64_t), + ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t), uint64_t(42)), *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from( read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue2), - ExecutorAddr::fromPtr(FinalizeBase) + sizeof(uint64_t))}); + ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t))}); // Read first 64 bits of the zero-fill region. FR.AAPs.push_back({ *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from( read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue3), - ExecutorAddr::fromPtr(FinalizeBase) + sizeof(uint64_t) * 2), + ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t) * 2), {} // No dealloc action. 
}); - snmm_finalize(waitFor(FinalizeKey), SNMM.get(), std::move(FR)); - void *FinalizeKeyAddr = cantFail(cantFail(FinalizeKey.get())); + snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(FR)); + void *InitializeKeyAddr = cantFail(cantFail(InitializeKey.get())); EXPECT_EQ(SentinelValue1, 42U); EXPECT_EQ(SentinelValue2, 0U); EXPECT_EQ(SentinelValue3, 0U); std::future<Expected<Error>> DeallocResult; - snmm_deallocateMultiple(waitFor(DeallocResult), SNMM.get(), - {&FinalizeKeyAddr, 1}); + snmm_deinitializeMultiple(waitFor(DeallocResult), SNMM.get(), + {&InitializeKeyAddr, 1}); cantFail(cantFail(DeallocResult.get())); EXPECT_EQ(SentinelValue1, 42U); @@ -255,33 +258,33 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) { cantFail(cantFail(ReleaseResult.get())); } -TEST(SimpleNativeMemoryMap, ReserveFinalizeShutdown) { - // Test that memory is deallocated in the case where we reserve and finalize - // some memory, then just shut down the memory manager. +TEST(SimpleNativeMemoryMap, ReserveInitializeShutdown) { + // Test that memory is deinitialized in the case where we reserve and + // initialize some memory, then just shut down the memory manager. auto SNMM = std::make_unique<SimpleNativeMemoryMap>(); std::future<Expected<Expected<void *>>> ReserveAddr; snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024); void *Addr = cantFail(cantFail(ReserveAddr.get())); - std::future<Expected<Expected<void *>>> FinalizeKey; - TestSNMMFinalizeRequest FR; - char *FinalizeBase = // Finalize addr at non-zero (64kb) offset from base. + std::future<Expected<Expected<void *>>> InitializeKey; + TestSNMMInitializeRequest FR; + char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base. 
reinterpret_cast<char *>(Addr) + 64 * 1024; uint64_t SentinelValue = 0; FR.Segments.push_back( - {MemProt::Read | MemProt::Write, FinalizeBase, 64 * 1024}); + {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024}); FR.AAPs.push_back( {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from( - write_value_sps_allocaction, ExecutorAddr::fromPtr(FinalizeBase), + write_value_sps_allocaction, ExecutorAddr::fromPtr(InitializeBase), uint64_t(42)), *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from( read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue), - ExecutorAddr::fromPtr(FinalizeBase))}); - snmm_finalize(waitFor(FinalizeKey), SNMM.get(), std::move(FR)); - cantFail(cantFail(FinalizeKey.get())); + ExecutorAddr::fromPtr(InitializeBase))}); + snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(FR)); + cantFail(cantFail(InitializeKey.get())); EXPECT_EQ(SentinelValue, 0U); @@ -292,33 +295,33 @@ TEST(SimpleNativeMemoryMap, ReserveFinalizeShutdown) { EXPECT_EQ(SentinelValue, 42); } -TEST(SimpleNativeMemoryMap, ReserveFinalizeDetachShutdown) { - // Test that memory is deallocated in the case where we reserve and finalize - // some memory, then just shut down the memory manager. +TEST(SimpleNativeMemoryMap, ReserveInitializeDetachShutdown) { + // Test that memory is deinitialized in the case where we reserve and + // initialize some memory, then just shut down the memory manager. auto SNMM = std::make_unique<SimpleNativeMemoryMap>(); std::future<Expected<Expected<void *>>> ReserveAddr; snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024); void *Addr = cantFail(cantFail(ReserveAddr.get())); - std::future<Expected<Expected<void *>>> FinalizeKey; - TestSNMMFinalizeRequest FR; - char *FinalizeBase = // Finalize addr at non-zero (64kb) offset from base. + std::future<Expected<Expected<void *>>> InitializeKey; + TestSNMMInitializeRequest FR; + char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base. 
reinterpret_cast<char *>(Addr) + 64 * 1024; uint64_t SentinelValue = 0; FR.Segments.push_back( - {MemProt::Read | MemProt::Write, FinalizeBase, 64 * 1024}); + {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024}); FR.AAPs.push_back( {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from( - write_value_sps_allocaction, ExecutorAddr::fromPtr(FinalizeBase), + write_value_sps_allocaction, ExecutorAddr::fromPtr(InitializeBase), uint64_t(42)), *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from( read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue), - ExecutorAddr::fromPtr(FinalizeBase))}); - snmm_finalize(waitFor(FinalizeKey), SNMM.get(), std::move(FR)); - cantFail(cantFail(FinalizeKey.get())); + ExecutorAddr::fromPtr(InitializeBase))}); + snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(FR)); + cantFail(cantFail(InitializeKey.get())); EXPECT_EQ(SentinelValue, 0U); diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 74ba077..e61d6b2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6309,6 +6309,7 @@ cc_library( ":GPUDialect", ":IR", ":InferIntRangeInterface", + ":InliningUtils", ":LLVMDialect", ":ROCDLOpsIncGen", ":SideEffectInterfaces", @@ -11369,6 +11370,7 @@ cc_library( ":VectorDialect", ":VectorTransforms", ":VectorUtils", + "//llvm:Support", ], ) @@ -11643,6 +11645,7 @@ cc_library( ":TensorUtils", ":TosaDialect", ":TransformUtils", + "//llvm:Support", ], ) |