diff options
Diffstat (limited to 'llvm/test/CodeGen')
96 files changed, 17112 insertions, 11613 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir new file mode 100644 index 0000000..824ada1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir @@ -0,0 +1,278 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00011000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00011010 SignBits:3 + %0:_(s8) = G_CONSTANT i8 2 + %1:_(s8) = G_CONSTANT i8 24 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstZero +body: | + bb.1: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:00000001 SignBits:7 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 1 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegOne + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstSeven +body: | + bb.1: + ; CHECK-LABEL: name: @CstSeven + ; CHECK-NEXT: %0:_ KnownBits:00001000 SignBits:4 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000111 SignBits:5 + %0:_(s8) = G_CONSTANT i8 8 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... 
+--- +name: CstNeg +body: | + bb.1: + ; CHECK-LABEL: name: @CstNeg + ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:11100010 SignBits:3 + %0:_(s8) = G_CONSTANT i8 224 + %1:_(s8) = G_CONSTANT i8 2 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarVar +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 255 + %4:_(s8) = G_ADD %2, %3 +... +--- +name: ScalarLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ADD %1, %0 +... +--- +name: ScalarPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? 
SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5 + ; CHECK-NEXT: %4:_ KnownBits:000????? SignBits:3 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 5 + %4:_(s8) = G_ADD %2, %3 +... +--- +name: VectorCstZero +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstZero + ; CHECK-NEXT: %0:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s16) = G_CONSTANT i16 1 + %1:_(s16) = G_CONSTANT i16 65535 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_ADD %2, %3 +... +--- +name: VectorCstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstNegOne + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(s16) = G_CONSTANT i16 65535 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_ADD %2, %3 +... +--- +name: VectorVar +body: | + bb.1: + ; CHECK-LABEL: name: @VectorVar + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s16>) = G_ADD %0, %1 +... +--- +name: VectorRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? 
SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_ADD %2, %0 +... +--- +name: VectorNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @VectorNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %5:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 65535 + %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4 + %6:_(<4 x s16>) = G_ADD %3, %5 +... +--- +name: VectorLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_ADD %0, %2 +... +--- +name: VectorPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? 
SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10 + ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9 + ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9 + ; CHECK-NEXT: %7:_ KnownBits:0000000????????? SignBits:7 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 42 + %5:_(s16) = G_CONSTANT i16 74 + %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4 + %7:_(<4 x s16>) = G_ADD %6, %3 +... +--- +name: VectorCst36 +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst36 + ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %4:_ KnownBits:000000000000???? SignBits:12 + %0:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = G_CONSTANT i16 6 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %4:_(<4 x s16>) = G_ADD %2, %3 +... + +--- +name: VectorCst3unknown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst3unknown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_ADD %0, %3 +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir index 8552931..ee35447 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir @@ -102,8 +102,8 @@ body: | ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 %0:_(<4 x s16>) = COPY $d0 - %2:_(s16) = COPY $h0 - %1:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 %4:_(<4 x s16>) = G_ASHR %0, %3 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir index 61d1c43..97bcb80 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir @@ -135,8 +135,8 @@ body: | ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 %0:_(<4 x s16>) = COPY $d0 - %2:_(s16) = COPY $h0 - %1:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 %4:_(<4 x s16>) = G_SHL %0, %3 ... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir new file mode 100644 index 0000000..332049d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir @@ -0,0 +1,276 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2 + %0:_(s8) = G_CONSTANT i8 2 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstZero +body: | + bb.1: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegOne + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7 + ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 1 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstNegFour +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegFour + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 4 + %2:_(s8) = G_SUB %0, %1 +... 
+--- +name: CstNeg +body: | + bb.1: + ; CHECK-LABEL: name: @CstNeg + ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2 + %0:_(s8) = G_CONSTANT i8 224 + %1:_(s8) = G_CONSTANT i8 2 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarVar +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 0 + %4:_(s8) = G_SUB %3, %2 +... +--- +name: ScalarLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_SUB %1, %0 +... +--- +name: ScalarPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? 
SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 5 + %4:_(s8) = G_SUB %2, %3 +... +--- +name: VectorCstZero +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstZero + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_SUB %1, %2 +... +--- +name: VectorCstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstNegOne + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(s16) = G_CONSTANT i16 1 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_SUB %2, %3 +... +--- +name: VectorVar +body: | + bb.1: + ; CHECK-LABEL: name: @VectorVar + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s16>) = G_SUB %0, %1 +... +--- +name: VectorRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? 
SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_SUB %2, %0 +... +--- +name: VectorNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @VectorNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 0 + %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4 + %6:_(<4 x s16>) = G_SUB %5, %3 +... +--- +name: VectorLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_SUB %0, %2 +... +--- +name: VectorPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10 + ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9 + ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9 + ; CHECK-NEXT: %7:_ KnownBits:???????????????? 
SignBits:7 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 42 + %5:_(s16) = G_CONSTANT i16 74 + %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4 + %7:_(<4 x s16>) = G_SUB %6, %3 +... +--- +name: VectorCst36 +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst36 + ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12 + %0:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = G_CONSTANT i16 6 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %4:_(<4 x s16>) = G_SUB %2, %3 +... + +--- +name: VectorCst3unknown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst3unknown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_SUB %0, %3 +... 
diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll index aa070b7..9b456a5 100644 --- a/llvm/test/CodeGen/AArch64/adds_cmn.ll +++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -22,10 +22,8 @@ entry: define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) { ; CHECK-LABEL: adds_cmn_c: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn w0, w1 -; CHECK-NEXT: add w1, w1, w0 -; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: adds w1, w0, w1 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll index dc88f94..cca190f 100644 --- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -1774,3 +1774,88 @@ define i128 @combine_i128_sdiv_const100(i128 %x) { %1 = sdiv i128 %x, 100 ret i128 %1 } + +; The following only becomes an sdiv_by_one after type legalisation, after which +; the splatted scalar constant has a different type to the splat vector. This +; test verifies DAGCombiner does not care about this type difference. 
+define <16 x i16> @combine_vec_sdiv_by_one_obfuscated(<16 x i16> %x) "target-features"="+sve" { +; CHECK-SD-LABEL: combine_vec_sdiv_by_one_obfuscated: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_one_obfuscated: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: movi v3.8h, #1 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: mov v3.h[0], v2.h[0] +; CHECK-GI-NEXT: smov w9, v3.h[0] +; CHECK-GI-NEXT: smov w16, v3.h[7] +; CHECK-GI-NEXT: sdiv w14, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[1] +; CHECK-GI-NEXT: smov w9, v3.h[1] +; CHECK-GI-NEXT: sdiv w15, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[2] +; CHECK-GI-NEXT: smov w9, v3.h[2] +; CHECK-GI-NEXT: sdiv w13, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[3] +; CHECK-GI-NEXT: smov w9, v3.h[3] +; CHECK-GI-NEXT: sdiv w12, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[4] +; CHECK-GI-NEXT: smov w9, v3.h[4] +; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[5] +; CHECK-GI-NEXT: smov w9, v3.h[5] +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[6] +; CHECK-GI-NEXT: smov w9, v3.h[6] +; CHECK-GI-NEXT: movi v3.8h, #1 +; CHECK-GI-NEXT: smov w17, v3.h[0] +; CHECK-GI-NEXT: smov w18, v3.h[1] +; CHECK-GI-NEXT: smov w0, v3.h[2] +; CHECK-GI-NEXT: smov w1, v3.h[3] +; CHECK-GI-NEXT: smov w2, v3.h[4] +; CHECK-GI-NEXT: smov w3, v3.h[5] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: smov w9, v0.h[7] +; CHECK-GI-NEXT: fmov s0, w14 +; CHECK-GI-NEXT: mov v0.h[1], w15 +; CHECK-GI-NEXT: smov w15, v1.h[6] +; CHECK-GI-NEXT: mov v0.h[2], w13 +; CHECK-GI-NEXT: sdiv w9, w9, w16 +; CHECK-GI-NEXT: smov w16, v1.h[0] +; CHECK-GI-NEXT: mov v0.h[3], w12 +; CHECK-GI-NEXT: smov w12, v1.h[7] +; CHECK-GI-NEXT: mov v0.h[4], w11 +; CHECK-GI-NEXT: sdiv w16, w16, w17 +; CHECK-GI-NEXT: smov w17, v1.h[1] +; CHECK-GI-NEXT: mov v0.h[5], w10 +; CHECK-GI-NEXT: mov v0.h[6], w8 +; CHECK-GI-NEXT: sdiv w17, w17, w18 +; CHECK-GI-NEXT: smov w18, v1.h[2] +; CHECK-GI-NEXT: fmov 
s2, w16 +; CHECK-GI-NEXT: smov w16, v3.h[6] +; CHECK-GI-NEXT: mov v0.h[7], w9 +; CHECK-GI-NEXT: sdiv w18, w18, w0 +; CHECK-GI-NEXT: smov w0, v1.h[3] +; CHECK-GI-NEXT: mov v2.h[1], w17 +; CHECK-GI-NEXT: sdiv w0, w0, w1 +; CHECK-GI-NEXT: smov w1, v1.h[4] +; CHECK-GI-NEXT: mov v2.h[2], w18 +; CHECK-GI-NEXT: sdiv w1, w1, w2 +; CHECK-GI-NEXT: smov w2, v1.h[5] +; CHECK-GI-NEXT: mov v2.h[3], w0 +; CHECK-GI-NEXT: sdiv w14, w2, w3 +; CHECK-GI-NEXT: mov v2.h[4], w1 +; CHECK-GI-NEXT: sdiv w13, w15, w16 +; CHECK-GI-NEXT: smov w15, v3.h[7] +; CHECK-GI-NEXT: mov v2.h[5], w14 +; CHECK-GI-NEXT: sdiv w10, w12, w15 +; CHECK-GI-NEXT: mov v2.h[6], w13 +; CHECK-GI-NEXT: mov v2.h[7], w10 +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %zero_and_ones = shufflevector <16 x i16> zeroinitializer, <16 x i16> splat (i16 1), <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %div = sdiv <16 x i16> %x, %zero_and_ones + ret <16 x i16> %div +} diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir index 5933c5d..b8302e6 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir @@ -380,10 +380,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4) ; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0 ; CHECK-NEXT: $p5 = frame-destroy 
LDR_PXI $sp, 1 :: (load (s16) from %stack.3) @@ -430,10 +428,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3) @@ -557,10 +553,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 ; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3) ; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16 - ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5) - ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32 + ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5) + ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20) @@ -745,10 +739,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 ; CHECK-NEXT: frame-destroy SEH_SetFP - ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3) - ; 
CHECK-NEXT: frame-destroy SEH_SaveFPLR 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3) + ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2 ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18) @@ -869,10 +861,8 @@ body: | ; CHECK-NEXT: frame-destroy SEH_EpilogStart ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg ; CHECK-NEXT: frame-destroy SEH_AllocZ 7 - ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6) - ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 - ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16 + ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6) + ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16 ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8) ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1 ; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7) diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index ecd48d6..149b4c4 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) { define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) { ; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, w1 -; CHECK-NEXT: cmn w1, w0 +; CHECK-NEXT: adds w8, w1, w0 ; CHECK-NEXT: csinv w0, w8, wzr, lo ; CHECK-NEXT: ret %noty = xor i32 %y, -1 @@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 
%y) { define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, x1 -; CHECK-NEXT: cmn x1, x0 +; CHECK-NEXT: adds x8, x1, x0 ; CHECK-NEXT: csinv x0, x8, xzr, lo ; CHECK-NEXT: ret %noty = xor i64 %y, -1 diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index b6dee97e..b8d6c88 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -732,6 +732,247 @@ exit: ret void } +; This example corresponds to: +; +; __arm_agnostic("sme_za_state") void try_catch_agnostic_za_invoke() +; { +; try { +; agnostic_za_call(); +; } catch(...) { +; } +; } +; +; In this example we preserve all SME state enabled by PSTATE.ZA using +; `__arm_sme_save` before agnostic_za_call(). This is because on all normal +; returns from an agnostic ZA function ZA state should be preserved. That means +; we need to make sure ZA state is saved in case agnostic_za_call() throws, and +; we need to restore ZA state after unwinding to the catch block. + +define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_agnostic_za_invoke: +; CHECK: .Lfunc_begin5: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception5 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .Ltmp15: // EH_LABEL +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl agnostic_za_call +; CHECK-NEXT: .Ltmp16: // EH_LABEL +; CHECK-NEXT: .LBB5_1: // %exit +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_2: // %catch +; CHECK-NEXT: .Ltmp17: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB5_1 +; +; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke: +; CHECK-SDAG: .Lfunc_begin5: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception5 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: .Ltmp15: // EH_LABEL +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl agnostic_za_call +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: .Ltmp16: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB5_1: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB5_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp17: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: b .LBB5_1 +entry: + invoke void @agnostic_za_call() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + +; This is the same `try_catch_agnostic_za_invoke`, but shows a lazy save would +; also need to be committed in a shared-ZA function calling an agnostic-ZA function. 
+define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_za_agnostic_za_callee: +; CHECK: .Lfunc_begin6: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception6 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: .Ltmp18: // EH_LABEL +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl agnostic_za_call +; CHECK-NEXT: .Ltmp19: // EH_LABEL +; CHECK-NEXT: .LBB6_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB6_3 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_3: // %exit +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_4: // %catch +; CHECK-NEXT: .Ltmp20: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB6_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee: +; CHECK-SDAG: .Lfunc_begin6: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception6 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp18: // EH_LABEL +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl agnostic_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_2 +; CHECK-SDAG-NEXT: // %bb.1: // %entry +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_2: // %entry +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp19: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB6_3: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB6_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp20: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; 
CHECK-SDAG-NEXT: .LBB6_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB6_3 +entry: + invoke void @agnostic_za_call() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) @@ -742,3 +983,4 @@ declare void @may_throw() declare void @shared_za_call() "aarch64_inout_za" declare void @noexcept_shared_za_call() "aarch64_inout_za" declare void @shared_zt0_call() "aarch64_inout_zt0" +declare void @agnostic_za_call() "aarch64_za_state_agnostic" diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll index 53ac934..3ba4a1c 100644 --- a/llvm/test/CodeGen/AArch64/win-sve.ll +++ b/llvm/test/CodeGen/AArch64/win-sve.ll @@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 
16-byte Folded Reload @@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) { ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded 
Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: stur x0, [x8, #16] ; CHECK-NEXT: addvl x8, x29, #18 ; CHECK-NEXT: ldr x1, [x8, #32] -; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .Ltmp0: // EH_LABEL ; CHECK-NEXT: add x0, x19, #0 ; CHECK-NEXT: bl g6 -; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .Ltmp1: // EH_LABEL ; CHECK-NEXT: // %bb.1: // %invoke.cont ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: add sp, sp, #64 @@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_preg p14, 10 ; CHECK-NEXT: ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: .seh_save_preg p15, 11 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .seh_allocz 18 ; CHECK-NEXT: add sp, sp, #16 @@ -1024,10 +1010,8 @@ define void 
@f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr ; CHECK-NEXT: .seh_save_fplr 16 ; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x28, 8 -; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x19, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x19, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) 
{ ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .seh_allocz 1 ; CHECK-NEXT: add sp, sp, #64 @@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 2 ; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: str d0, [sp, #8] ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .seh_allocz 1 -; CHECK-NEXT: ldr x30, [sp] // 8-byte 
Folded Reload -; CHECK-NEXT: .seh_save_reg x30, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" { ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" { ; CHECK-NEXT: .seh_allocz 1 ; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 32 ; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) { ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 -; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload -; CHECK-NEXT: .seh_save_reg x28, 0 -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x28, 16 ; 
CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_zreg z8, 0 ; CHECK-NEXT: addvl sp, sp, #1 @@ -1594,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) { store i32 %i, ptr %a ret void } + +declare ptr @llvm.swift.async.context.addr() + +define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) { +; CHECK-LABEL: f16: +; CHECK: .seh_proc f16 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: orr x29, x29, #0x1000000000000000 +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .seh_allocz 1 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_zreg z8, 0 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 8 +; CHECK-NEXT: str x22, [sp] +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: add x29, sp, #8 +; CHECK-NEXT: .seh_add_fp 8 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr x8, [x22] +; CHECK-NEXT: stur x8, [x29, #-8] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 +; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 8 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_zreg z8, 0 +; CHECK-NEXT: and x29, x29, #0xefffffffffffffff +; CHECK-NEXT: .seh_nop +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .seh_allocz 1 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + tail call void asm sideeffect "", "~{z8}"() + %1 = load ptr, ptr %ctx, align 8 + %2 = tail call ptr @llvm.swift.async.context.addr() + store ptr %1, ptr %2, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 
549af87..a43bfb5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; CI-NEXT: s_cbranch_vccz .LBB9_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else20 ; CI-NEXT: s_and_b32 s2, s0, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB9_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute19 ; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 @@ -1083,10 +1083,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_7 ; CI-NEXT: .LBB9_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB9_7: ; %frem.loop_exit +; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1125,7 +1125,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; CI-NEXT: s_cbranch_vccz .LBB9_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_and_b32 s4, s2, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -1136,7 +1136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s4, s4, 1 ; CI-NEXT: s_cmp_lg_u32 s4, 0 ; CI-NEXT: s_cbranch_scc1 .LBB9_16 -; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 @@ -1161,10 +1161,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: .LBB9_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_15 ; CI-NEXT: .LBB9_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1237,7 +1237,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; VI-NEXT: s_cbranch_vccz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else20 ; VI-NEXT: s_and_b32 s2, s0, 0x8000 ; 
VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1248,7 +1248,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB9_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute19 ; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; VI-NEXT: v_ldexp_f32 v1, v3, 1 @@ -1273,10 +1273,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: .LBB9_5: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_7 ; VI-NEXT: .LBB9_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1315,7 +1315,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; VI-NEXT: s_cbranch_vccz .LBB9_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_and_b32 s3, s4, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1326,7 +1326,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s3, s3, 1 ; VI-NEXT: s_cmp_lg_u32 s3, 0 ; VI-NEXT: 
s_cbranch_scc1 .LBB9_16 -; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; VI-NEXT: v_ldexp_f32 v2, v4, 1 @@ -1351,10 +1351,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: .LBB9_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1370,7 +1370,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_15 ; VI-NEXT: .LBB9_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1425,7 +1425,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; CI-NEXT: s_cbranch_vccz .LBB10_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else86 ; CI-NEXT: s_and_b32 s0, s4, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1436,7 +1436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s0, s0, 1 ; CI-NEXT: s_cmp_lg_u32 s0, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute85 ; CI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1 @@ -1461,10 
+1461,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: .LBB10_5: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1480,7 +1480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_7 ; CI-NEXT: .LBB10_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1503,7 +1503,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else53 ; CI-NEXT: s_and_b32 s1, s6, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1514,7 +1514,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_16 -; CI-NEXT: ; %bb.11: ; %frem.compute19 +; CI-NEXT: ; %bb.11: ; %frem.compute52 ; CI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1 @@ -1539,10 +1539,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_14 -; 
CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: .LBB10_13: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1558,7 +1558,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_15 ; CI-NEXT: .LBB10_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1579,7 +1579,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: ; %bb.17: ; %frem.else20 ; CI-NEXT: s_and_b32 s1, s5, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 ; CI-NEXT: v_mov_b32_e32 v2, s1 @@ -1590,7 +1590,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_24 -; CI-NEXT: ; %bb.19: ; %frem.compute52 +; CI-NEXT: ; %bb.19: ; %frem.compute19 ; CI-NEXT: v_frexp_mant_f32_e32 v5, v3 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 ; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1 @@ -1615,10 +1615,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 ; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_22 -; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7 ; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; CI-NEXT: 
.LBB10_21: ; %frem.loop_body60 +; CI-NEXT: .LBB10_21: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v6 ; CI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -1634,7 +1634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_23 ; CI-NEXT: .LBB10_22: ; CI-NEXT: v_mov_b32_e32 v7, v6 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 ; CI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; implicit-def: $vgpr3 ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 ; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_and_b32 s1, s7, 0x8000 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1668,7 +1668,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s1, s1, 1 ; CI-NEXT: s_cmp_lg_u32 s1, 0 ; CI-NEXT: s_cbranch_scc1 .LBB10_32 -; CI-NEXT: ; %bb.27: ; %frem.compute85 +; CI-NEXT: ; %bb.27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e32 v6, v4 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 ; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1 @@ -1693,10 +1693,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 ; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_30 -; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8 ; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: .LBB10_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v8, v7 ; CI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -1712,7 +1712,7 @@ 
define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_31 ; CI-NEXT: .LBB10_30: ; CI-NEXT: v_mov_b32_e32 v8, v7 -; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5 ; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 ; CI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -1791,7 +1791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1 ; VI-NEXT: s_cbranch_vccz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else86 ; VI-NEXT: s_and_b32 s0, s8, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1802,7 +1802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute85 ; VI-NEXT: v_frexp_mant_f32_e32 v3, v1 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1 ; VI-NEXT: v_ldexp_f32 v1, v3, 1 @@ -1827,10 +1827,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: .LBB10_5: ; %frem.loop_body93 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -1846,7 +1846,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_7 ; VI-NEXT: .LBB10_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: .LBB10_7: 
; %frem.loop_exit94 ; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -1869,7 +1869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2 ; VI-NEXT: s_cbranch_vccz .LBB10_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else53 ; VI-NEXT: s_and_b32 s0, s4, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2 ; VI-NEXT: v_mov_b32_e32 v1, s0 @@ -1880,7 +1880,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_16 -; VI-NEXT: ; %bb.11: ; %frem.compute19 +; VI-NEXT: ; %bb.11: ; %frem.compute52 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v2 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2 ; VI-NEXT: v_ldexp_f32 v2, v4, 1 @@ -1905,10 +1905,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: .LBB10_13: ; %frem.loop_body60 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -1924,7 +1924,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_15 ; VI-NEXT: .LBB10_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1945,7 +1945,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, 
ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; VI-NEXT: s_cbranch_vccz .LBB10_18 -; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: ; %bb.17: ; %frem.else20 ; VI-NEXT: s_and_b32 s0, s9, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1956,7 +1956,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_24 -; VI-NEXT: ; %bb.19: ; %frem.compute52 +; VI-NEXT: ; %bb.19: ; %frem.compute19 ; VI-NEXT: v_frexp_mant_f32_e32 v5, v3 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3 ; VI-NEXT: v_ldexp_f32 v3, v5, 1 @@ -1981,10 +1981,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4 ; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_22 -; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 -; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: .LBB10_21: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -2000,7 +2000,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_23 ; VI-NEXT: .LBB10_22: ; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 ; VI-NEXT: v_ldexp_f32 v4, v7, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4 ; VI-NEXT: s_cbranch_vccz .LBB10_26 -; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: ; %bb.25: ; %frem.else ; 
VI-NEXT: s_and_b32 s0, s12, 0x8000 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4 ; VI-NEXT: v_mov_b32_e32 v3, s0 @@ -2034,7 +2034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s0, s0, 1 ; VI-NEXT: s_cmp_lg_u32 s0, 0 ; VI-NEXT: s_cbranch_scc1 .LBB10_32 -; VI-NEXT: ; %bb.27: ; %frem.compute85 +; VI-NEXT: ; %bb.27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e32 v6, v4 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4 ; VI-NEXT: v_ldexp_f32 v4, v6, 1 @@ -2059,10 +2059,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5 ; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_30 -; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8 ; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: .LBB10_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v8, v7 ; VI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -2078,7 +2078,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_31 ; VI-NEXT: .LBB10_30: ; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5 ; VI-NEXT: v_ldexp_f32 v5, v8, v5 ; VI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -2144,7 +2144,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| ; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_cbranch_vccz .LBB11_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: s_and_b32 s6, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2156,7 +2156,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: 
s_xor_b32 s6, s6, 1 ; CI-NEXT: s_cmp_lg_u32 s6, 0 ; CI-NEXT: s_cbranch_scc1 .LBB11_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 ; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 @@ -2181,10 +2181,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: .LBB11_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2200,7 +2200,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_7 ; CI-NEXT: .LBB11_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2219,7 +2219,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s6, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB11_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_and_b32 s6, s3, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2231,7 +2231,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s6, s6, 1 ; CI-NEXT: s_cmp_lg_u32 s6, 0 ; CI-NEXT: s_cbranch_scc1 .LBB11_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5| ; CI-NEXT: v_ldexp_f32_e64 v2, 
v2, 1 ; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 @@ -2256,10 +2256,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: .LBB11_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_15 ; CI-NEXT: .LBB11_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2317,7 +2317,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0| ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_cbranch_vccz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: s_and_b32 s6, s2, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2329,7 +2329,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s6, s6, 1 ; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc1 .LBB11_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; VI-NEXT: v_ldexp_f32 v1, v1, 1 ; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0 @@ -2354,10 +2354,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; VI-NEXT: 
v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: .LBB11_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_7 ; VI-NEXT: .LBB11_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2392,7 +2392,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s6, 1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB11_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_and_b32 s6, s3, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2404,7 +2404,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s6, s6, 1 ; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc1 .LBB11_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5| ; VI-NEXT: v_ldexp_f32 v2, v2, 1 ; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0 @@ -2429,10 +2429,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 ; VI-NEXT: v_sub_u32_e32 v3, 
vcc, v3, v7 -; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: .LBB11_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2448,7 +2448,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_15 ; VI-NEXT: .LBB11_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2498,7 +2498,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| ; CI-NEXT: ; implicit-def: $vgpr0 ; CI-NEXT: s_cbranch_vccz .LBB12_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else78 ; CI-NEXT: s_and_b32 s2, s4, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v1, s8 ; CI-NEXT: v_mov_b32_e32 v0, s4 @@ -2510,7 +2510,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute77 ; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8| ; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1 ; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 @@ -2535,10 +2535,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_6 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5 ; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: .LBB12_5: ; %frem.loop_body85 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v5, v4 ; CI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ 
-2554,7 +2554,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_7 ; CI-NEXT: .LBB12_6: ; CI-NEXT: v_mov_b32_e32 v5, v4 -; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2 ; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2 ; CI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2573,7 +2573,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB12_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else47 ; CI-NEXT: s_and_b32 s2, s5, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v2, s9 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2585,7 +2585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute46 ; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1 ; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 @@ -2610,10 +2610,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6 ; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 -; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: .LBB12_13: ; %frem.loop_body54 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v6, v5 ; CI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2629,7 +2629,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_15 ; CI-NEXT: .LBB12_14: ; CI-NEXT: v_mov_b32_e32 v6, v5 -; CI-NEXT: .LBB12_15: ; 
%frem.loop_exit24 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3 ; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3 ; CI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2648,7 +2648,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr2 ; CI-NEXT: s_cbranch_vccz .LBB12_18 -; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: ; %bb.17: ; %frem.else16 ; CI-NEXT: s_and_b32 s2, s6, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v3, s10 ; CI-NEXT: v_mov_b32_e32 v2, s6 @@ -2660,7 +2660,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_24 -; CI-NEXT: ; %bb.19: ; %frem.compute46 +; CI-NEXT: ; %bb.19: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10| ; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1 ; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 @@ -2685,10 +2685,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 ; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_22 -; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7 ; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 -; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: .LBB12_21: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v6 ; CI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -2704,7 +2704,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_23 ; CI-NEXT: .LBB12_22: ; CI-NEXT: v_mov_b32_e32 v7, v6 -; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4 ; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4 ; CI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -2723,7 +2723,7 @@ define 
amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr3 ; CI-NEXT: s_cbranch_vccz .LBB12_26 -; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_and_b32 s2, s7, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v4, s11 ; CI-NEXT: v_mov_b32_e32 v3, s7 @@ -2735,7 +2735,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB12_32 -; CI-NEXT: ; %bb.27: ; %frem.compute77 +; CI-NEXT: ; %bb.27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11| ; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1 ; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 @@ -2760,10 +2760,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 ; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_30 -; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8 ; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 -; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: .LBB12_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v8, v7 ; CI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -2779,7 +2779,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_31 ; CI-NEXT: .LBB12_30: ; CI-NEXT: v_mov_b32_e32 v8, v7 -; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5 ; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5 ; CI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -2829,7 +2829,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0| ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: s_cbranch_vccz .LBB12_2 -; VI-NEXT: ; %bb.1: ; 
%frem.else +; VI-NEXT: ; %bb.1: ; %frem.else78 ; VI-NEXT: s_and_b32 s2, s4, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2841,7 +2841,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute77 ; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8| ; VI-NEXT: v_ldexp_f32 v1, v1, 1 ; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0 @@ -2866,10 +2866,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2 ; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: .LBB12_5: ; %frem.loop_body85 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_mul_f32_e32 v4, v5, v3 @@ -2885,7 +2885,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_7 ; VI-NEXT: .LBB12_6: ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2 ; VI-NEXT: v_ldexp_f32 v2, v5, v2 ; VI-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -2904,7 +2904,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB12_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else47 ; VI-NEXT: s_and_b32 s2, s5, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v2, s9 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2916,7 +2916,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute46 ; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; VI-NEXT: v_ldexp_f32 v2, v2, 1 ; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0 @@ -2941,10 +2941,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3 ; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: .LBB12_13: ; %frem.loop_body54 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v6, v5 ; VI-NEXT: v_mul_f32_e32 v5, v6, v4 @@ -2960,7 +2960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_15 ; VI-NEXT: .LBB12_14: ; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3 ; VI-NEXT: v_ldexp_f32 v3, v6, v3 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2979,7 +2979,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr2 ; VI-NEXT: s_cbranch_vccz .LBB12_18 -; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: ; %bb.17: ; %frem.else16 ; VI-NEXT: s_and_b32 s2, s6, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v3, s10 ; VI-NEXT: v_mov_b32_e32 v2, s6 @@ -2991,7 +2991,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_24 -; VI-NEXT: ; %bb.19: ; %frem.compute46 +; VI-NEXT: ; %bb.19: ; %frem.compute15 ; VI-NEXT: 
v_frexp_mant_f32_e64 v3, |s10| ; VI-NEXT: v_ldexp_f32 v3, v3, 1 ; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0 @@ -3016,10 +3016,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4 ; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_22 -; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 -; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: .LBB12_21: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_mul_f32_e32 v6, v7, v5 @@ -3035,7 +3035,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_23 ; VI-NEXT: .LBB12_22: ; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4 ; VI-NEXT: v_ldexp_f32 v4, v7, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -3054,7 +3054,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr3 ; VI-NEXT: s_cbranch_vccz .LBB12_26 -; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_and_b32 s2, s7, 0x80000000 ; VI-NEXT: v_mov_b32_e32 v4, s11 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -3066,7 +3066,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB12_32 -; VI-NEXT: ; %bb.27: ; %frem.compute77 +; VI-NEXT: ; %bb.27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11| ; VI-NEXT: v_ldexp_f32 v4, v4, 1 ; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0 @@ -3091,10 +3091,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5 ; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_30 -; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8 ; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: .LBB12_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v8, v7 ; VI-NEXT: v_mul_f32_e32 v7, v8, v6 @@ -3110,7 +3110,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_31 ; VI-NEXT: .LBB12_30: ; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5 ; VI-NEXT: v_ldexp_f32 v5, v8, v5 ; VI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -3169,7 +3169,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| ; CI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CI-NEXT: s_cbranch_vccz .LBB13_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| @@ -3187,7 +3187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB13_8 -; CI-NEXT: ; %bb.3: ; %frem.compute +; CI-NEXT: ; %bb.3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| @@ -3210,10 +3210,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_6 -; CI-NEXT: ; %bb.4: ; 
%frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6 ; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7 -; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: .LBB13_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v7, v5 ; CI-NEXT: v_mov_b32_e32 v6, v4 @@ -3232,7 +3232,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: .LBB13_6: ; CI-NEXT: v_mov_b32_e32 v7, v5 ; CI-NEXT: v_mov_b32_e32 v6, v4 -; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9 ; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 ; CI-NEXT: s_mov_b32 s2, 0 @@ -3256,7 +3256,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_mov_b32 s2, 1 ; CI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CI-NEXT: s_cbranch_vccz .LBB13_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| @@ -3274,7 +3274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_xor_b32 s2, s2, 1 ; CI-NEXT: s_cmp_lg_u32 s2, 0 ; CI-NEXT: s_cbranch_scc1 .LBB13_16 -; CI-NEXT: ; %bb.11: ; %frem.compute15 +; CI-NEXT: ; %bb.11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| ; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| @@ -3297,10 +3297,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_14 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8 ; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9 -; CI-NEXT: 
.LBB13_13: ; %frem.loop_body23 +; CI-NEXT: .LBB13_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mov_b32_e32 v8, v6 @@ -3319,7 +3319,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: .LBB13_14: ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mov_b32_e32 v8, v6 -; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11 ; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 ; CI-NEXT: s_mov_b32 s2, 0 @@ -3371,7 +3371,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]| ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 ; VI-NEXT: s_cbranch_vccz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]| @@ -3389,7 +3389,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB13_8 -; VI-NEXT: ; %bb.3: ; %frem.compute +; VI-NEXT: ; %bb.3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]| @@ -3412,10 +3412,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9 ; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_6 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6 ; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7 -; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: .LBB13_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v5 ; 
VI-NEXT: v_mov_b32_e32 v6, v4 @@ -3434,7 +3434,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: .LBB13_6: ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9 ; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4 ; VI-NEXT: s_mov_b32 s2, 0 @@ -3458,7 +3458,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_mov_b32 s2, 1 ; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 ; VI-NEXT: s_cbranch_vccz .LBB13_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]| @@ -3476,7 +3476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_xor_b32 s2, s2, 1 ; VI-NEXT: s_cmp_lg_u32 s2, 0 ; VI-NEXT: s_cbranch_scc1 .LBB13_16 -; VI-NEXT: ; %bb.11: ; %frem.compute15 +; VI-NEXT: ; %bb.11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]| ; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]| @@ -3499,10 +3499,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11 ; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_14 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8 ; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9 -; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: .LBB13_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mov_b32_e32 v8, v6 @@ -3521,7 +3521,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: .LBB13_14: ; VI-NEXT: 
v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11 ; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6 ; VI-NEXT: s_mov_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll index f96a6f7..b239c46 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -1,13 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}kernel_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_default() #0 { +; GCN-LABEL: kernel_ieee_mode_default: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; 
GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: 
s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_on() #1 { +; GCN-LABEL: kernel_ieee_mode_on: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_off() #2 { +; GCN-LABEL: kernel_ieee_mode_off: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 0 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: 
enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, 
off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_default() #0 { +; GCN-LABEL: func_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 
[[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_on() #1 { +; GCN-LABEL: func_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_off() #2 { +; GCN-LABEL: func_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 { ret void } -; GCN-LABEL: 
{{^}}cs_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_default() #0 { +; GCN-LABEL: cs_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_on() #1 { +; GCN-LABEL: cs_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr 
addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_off() #2 { +; GCN-LABEL: cs_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_default() #0 { +; GCN-LABEL: ps_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile 
float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_on() #1 { +; GCN-LABEL: ps_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_off() #2 { +; GCN-LABEL: ps_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index df9c97f..117af95 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6551,271 +6551,205 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: 
v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, 
v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: @@ -15709,61 +15643,61 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, 
s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -15778,121 +15712,123 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15903,215 +15839,179 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4 ; GFX11-TRUE16-NEXT: .LBB14_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, 
v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, 
v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: 
v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -16133,433 +16033,329 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: 
v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 
0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 
v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 
0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 
v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 
0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -42692,271 +42488,205 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: 
v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, 
v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; 
GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: @@ -53003,61 +52733,61 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, 
off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -53072,121 +52802,123 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, 
v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -53197,215 +52929,179 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4 ; GFX11-TRUE16-NEXT: .LBB38_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, 
v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 
v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -53427,433 +53123,329 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 
3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 
3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l 
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, 
v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, 
v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, 
v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -78968,271 +78560,205 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, 
v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off 
offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 
v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 
v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: @@ -88136,61 +87662,61 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 
offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -88205,121 +87731,123 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, 
off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -88330,215 +87858,179 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4 ; GFX11-TRUE16-NEXT: .LBB58_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; 
GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -88560,433 +88052,329 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 
v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 
0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 
0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, 
v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 
0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, 
v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -113114,271 +112502,205 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, 
v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v3.l, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: @@ -123405,61 +122727,61 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v37, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, 
s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 @@ -123474,121 +122796,123 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v148.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l -; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v101.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -123599,215 +122923,179 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4 ; GFX11-TRUE16-NEXT: .LBB74_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 
v149.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v6.h, v6.h, v133.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 -; GFX11-TRUE16-NEXT: v_or_b16 
v149.l, v22.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, 
v96.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 @@ -123829,433 +123117,329 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v23, v149, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 
3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, 
v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 
3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -161654,179 +160838,182 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184 -; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x18 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64 -; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, 
s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 +; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 
offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v98, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr155_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr124_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr181_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 @@ -161835,136 +161022,136 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v47, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v4 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v127, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v81 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v98 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v80 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v63, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[98:99] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v1.l +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[17:18] +; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v181.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v142.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v143.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v141.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v8.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v71.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v136.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v139.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v155.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v24.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, 
v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v98.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v98.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v99.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v80.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v81.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v81.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 @@ -161980,7 +161167,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81 ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 @@ -162019,10 +161206,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; 
GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v37, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v37, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 @@ -162036,97 +161223,101 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v135.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v33, v35, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v147.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v142, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v17, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v17, v33 :: v_dual_and_b32 v18, 0xffff0000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v18, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v33, v20, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v151.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v34 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v149, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v149.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v20, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v160.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v161.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v33 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v151.h ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v23 :: v_dual_lshlrev_b32 v21, 16, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162139,8 +161330,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v161, v19, v23 :: v_dual_lshlrev_b32 v22, 16, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v163.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162153,10 +161346,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 
x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v161.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v150.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162169,10 +161361,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v162.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v165.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162185,10 +161377,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v163.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v38 
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162201,14 +161393,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v51 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v81 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v167.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v99 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v81 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v53, v17, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 @@ -162217,14 +161409,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v98 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v160.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, 
v165.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v80 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v164.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v80 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_cndmask_b32 v52, v19, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 @@ -162233,10 +161425,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v37 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v177.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v49 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 @@ -162249,10 +161441,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v167.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 @@ -162263,13 +161454,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v55 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v179.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -162282,11 +161472,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v162.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v177.h -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -162301,9 +161490,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v65 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v65 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v2, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v166.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v181.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v2, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3 @@ -162313,13 +161502,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v179.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v51 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v164.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v1, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, 
v176.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v51 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v1, v18, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6 @@ -162330,13 +161519,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v183.h ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[48:49] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[37:38] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v2, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v50 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v2, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 @@ -162350,28 +161539,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v56, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v2, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 
v5, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v48 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v56.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v41.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v83, v1, v8 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v71, v1, v8 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 @@ -162380,29 +161568,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v60.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v166.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v46.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: 
v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v79.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v178.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v74, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -162410,40 +161598,40 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v76.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v176.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[82:83] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v182.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v74.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v72.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v2, v6, 
vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[70:71] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[54:55] ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v106, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v106.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v104, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v104.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v106, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[96:97] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v180.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v3, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v104.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, 
v[31:32] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v106.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[84:85] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 @@ -162452,8 +161640,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v178.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v127, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v136, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -162461,19 +161649,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v125, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v139, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v40.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v139.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 @@ -162481,11 +161669,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v125.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v127.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v102.l, v136.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v40.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v142, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 @@ -162494,389 +161682,322 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v142.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v141, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v2, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v42.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[112:113] -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v143, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[130:131] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v155, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[68:69] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[68:69] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v141.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[33:34] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v129 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v129 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v143.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v134 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v134 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v128 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v113 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[133:134] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[128:129] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[35:36] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v133 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v113 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v112 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v97 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v97 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v96 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v121, 24, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v154.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v155.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v148 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v148 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v130 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[147:148] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v147 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v103 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v102 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v84 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v71 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v70 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v68 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v67 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v65 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v54 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v42, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v52 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v181.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v139.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v180.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v143.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v141.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v183.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v140.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v177.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v140.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v40.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v138.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v136.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v5.h 
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v182.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v127.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v125.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v41.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v123.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v137.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v56.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v126.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v107.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v42.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v123.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; 
GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v79.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v111.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v91.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v60.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v109.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v75.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v5, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v106.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v95.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v5, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v61.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v111.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v72.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v109.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v46.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v107.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, 
v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v92.l +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v89.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v136.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v79.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v106.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v77.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v74.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v91.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v153.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v62.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v139.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v59.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v9.l, 0xff, v155.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v154.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v47.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v44.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v142.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v138.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v137.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v126.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v76.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v93.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, 
v133.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v5, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v127.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v89.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v45.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v104.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v78.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v120.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v5, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v142.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v5, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v105.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v125.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v63.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v5, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v36.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v122.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v120.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v110.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v108.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v95.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v93.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v90.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v143.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v58.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v90.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v5, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, 
v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v141.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v47.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v74.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v5, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v124.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v5, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v59.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v122.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v5, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v110.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v44.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v5, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v78.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v167.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v166.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v73.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v63.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v177.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v176.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v108.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v183.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v5, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v94.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v34.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v5, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v180.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v92.l -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v5, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v88.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v5, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v77.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v5, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v26.l, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v72.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v5, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v26.l, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v5.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v62.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v5, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v57.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v5, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v29.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v46.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v5, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.l, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v166.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v43.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v5, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v41.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v5, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 
v32.l, 0xff, v176.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v182.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v33.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v167.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v181.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v5, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v33.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v34.l, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v5, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v56.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v45.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 
v0, v[18:21], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 
v108, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x18 -; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:212 -; 
GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:108 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, 
off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:248 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -186713,69 +185834,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -186784,95 +185905,91 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; 
GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] @@ -186883,345 +186000,283 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] 
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, 
v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: 
v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v10.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, 
v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: @@ -209415,69 +208470,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -209486,95 +208541,91 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, 
v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 
op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] @@ -209585,345 +208636,283 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[51:52], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v83, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v2.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, 
v19.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v6.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v9.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, 
v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 64b5ecc..582f31b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -4125,19 +4125,19 @@ 
define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -4152,94 +4152,71 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, 
v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8614,19 +8591,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -8641,94 +8618,71 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x 
i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, 
v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, 
v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12703,19 +12657,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -12730,94 +12684,71 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16408,19 +16339,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -16435,94 +16366,71 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, 
v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19833,19 +19741,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -19860,94 +19768,71 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 
v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 
s[30:31] ; @@ -22745,19 +22630,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -22772,94 +22657,71 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24960,19 +24822,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -24987,94 
+24849,71 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: 
v_or_b16 v2.h, v2.h, v5.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index cb4b3bd..0a73571 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6298,31 +6298,33 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -6335,48 +6337,43 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -6387,122 +6384,88 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13349,31 +13312,33 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -13386,48 +13351,43 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, 
v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -13438,122 +13398,88 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 
v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, 
v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19888,31 +19814,33 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -19925,48 +19853,43 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; 
GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, 
v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -19977,122 +19900,88 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, 
v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25929,31 +25818,33 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -25966,48 +25857,43 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 
v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 @@ -26018,122 +25904,88 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 
v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 3aaf254..b622e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -3044,91 +3044,66 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, 
v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, 
v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: @@ -5025,39 +5000,41 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -5071,63 +5048,53 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -5140,147 +5107,110 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 
v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9991,91 +9921,66 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, 
v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: @@ -11997,39 +11902,41 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -12043,63 +11950,53 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 
v4.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, 
v2.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 @@ -12112,147 +12009,110 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr22_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 
0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16367,91 +16227,66 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 
v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, 
v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: @@ -22484,91 +22319,66 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 
v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8: @@ -28791,39 +28601,38 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -28837,65 
+28646,55 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 @@ -28906,146 +28705,110 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 
3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l 
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -30878,91 +30641,66 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8: @@ -32912,39 +32650,38 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 @@ -32958,65 +32695,55 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 @@ -33027,146 +32754,110 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, 
v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -35022,91 +34713,66 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v6.h, 0xff, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 632b03c..e6c7b1a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2279,17 +2279,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2301,13 +2297,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 
0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4530,17 +4522,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4552,13 +4540,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6487,17 +6471,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6509,13 +6489,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, 
v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8138,17 +8114,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8160,13 +8132,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9502,17 +9470,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9524,13 +9488,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10212,17 +10172,13 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10234,13 +10190,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index d3fbba3..bff054f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -8921,133 +8921,98 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l 
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 
v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -12574,53 +12539,52 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -12633,98 +12597,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, 
v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 
0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -12745,226 +12693,170 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23576,133 +23468,98 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -27358,53 +27215,52 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, 
off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -27417,98 +27273,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -27529,226 +27369,170 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 
0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 
0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, 
v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -37760,133 +37544,98 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, 
v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, 
v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; 
GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -41418,53 +41167,52 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -41477,98 +41225,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -41589,226 +41321,170 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: 
v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 
v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h 
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -50954,133 +50630,98 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 
0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 
v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -54638,53 +54279,52 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 
8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, 
v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -54697,98 +54337,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 
v64, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 @@ -54809,226 +54433,170 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr19_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, 
v39.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 
0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -64107,133 +63675,98 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, 
v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 
v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -76401,133 +75934,98 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v2.h, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, 
v6.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, 
v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v20.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, 
v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -85053,57 +84551,57 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -85111,29 +84609,29 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v15 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v3 @@ -85141,11 +84639,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v2.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v52.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v3.h @@ -85155,26 +84653,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v5.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 @@ -85187,71 +84685,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v20, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v18, v23 :: v_dual_lshlrev_b32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v18, v23 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v23, v26, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4 ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v20 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v53.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v1, v20, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v1, v19 :: v_dual_and_b32 v2, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
@@ -85304,305 +84803,266 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v65.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v66, v4, v5 :: v_dual_lshlrev_b32 v5, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v5, 16, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v21 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v1, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v22 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v3, v8 :: v_dual_and_b32 v3, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v3, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v1, v4 :: v_dual_add_f32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v1, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v67.h ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v23 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v68.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v82.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v1, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v26 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: 
v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v80.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v26 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v3, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v80.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v71.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v2, v3 :: v_dual_add_f32 v2, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v4, v5, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v97.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v6, v7 :: v_dual_and_b32 v5, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v96.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v33 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v112, v4, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 
v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v112.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v98.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v113, v4, v6 :: v_dual_add_f32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v113.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[23:24] ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, 
v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v103.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v113.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v115.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v117.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v39 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[38:39] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[32:33] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v38 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v1.h, 8, v131.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v50.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v1.l, v1.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, 
v115.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v31.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 
v11.h, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v69.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v84.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v83.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v70.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v15, v31, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.h, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.h, v19.l ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index ecc715c..11f90b9 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -3067,9 +3067,9 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3085,52 +3085,47 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6210,9 +6205,9 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6228,52 +6223,47 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9050,9 +9040,9 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9068,52 +9058,47 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11590,9 +11575,9 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -11608,52 +11593,47 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 
0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13809,9 +13789,9 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -13827,52 +13807,47 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, 
v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15630,9 +15605,9 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -15648,52 +15623,47 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: 
v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16934,9 +16904,9 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -16952,52 +16922,47 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 685e2fb..9a6ea1b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1104,16 +1104,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -1128,37 +1127,28 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true @@ -1166,36 +1156,26 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4254,16 +4234,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -4278,37 +4257,28 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -4316,36 +4286,26 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, 
v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6909,12 +6869,12 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v6.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -6929,37 +6889,28 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, 
v0 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true @@ -6967,36 +6918,26 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8669,12 +8610,12 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, 
v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -8689,37 +8630,28 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true @@ -8727,36 +8659,26 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10079,12 +10001,12 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -10099,37 +10021,28 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true @@ -10137,36 +10050,26 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: 
v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index cbf6b66..7dbbeaa 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -3632,13 +3632,9 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -3813,16 +3809,12 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec ; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v0.l, v0.h, 0xff bitop3:0xec +; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v1, off ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 26f204f..14897b6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1771,33 +1771,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1] +; GFX11-TRUE16-NEXT: global_load_b32 v5, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v5.h, 9 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v5.h +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v7.h, 0x900, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x900, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x900, v4.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 78a961e..415828f 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -4858,7 +4858,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: v_cvt_f16_f32_e32 v4, v2 ; SI-NEXT: s_cbranch_vccz .LBB9_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else20 ; SI-NEXT: v_bfi_b32 v7, s0, 0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 @@ -4869,7 +4869,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB9_2: ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB9_3: ; %frem.compute +; SI-NEXT: .LBB9_3: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5 @@ -4905,10 +4905,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; SI-NEXT: 
s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB9_5: ; %frem.loop_body +; SI-NEXT: .LBB9_5: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -4923,7 +4923,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB9_5 ; SI-NEXT: ; %bb.6: ; %Flow55 ; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: .LBB9_7: ; %frem.loop_exit +; SI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1 ; SI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -4944,7 +4944,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7 ; SI-NEXT: s_cbranch_vccz .LBB9_10 -; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v8, s0, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -4956,7 +4956,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB9_10: ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB9_11: ; %frem.compute19 +; SI-NEXT: .LBB9_11: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v6 @@ -4992,10 +4992,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB9_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB9_13: ; %frem.loop_body27 +; SI-NEXT: .LBB9_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v8, v6 ; SI-NEXT: v_mul_f32_e32 v6, v8, v7 @@ -5010,7 
+5010,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB9_13 ; SI-NEXT: ; %bb.14: ; %Flow ; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; SI-NEXT: .LBB9_15: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1 ; SI-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -5084,7 +5084,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB9_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else20 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_bfi_b32 v7, s0, 0, v2 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -5093,7 +5093,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_8 ; CI-NEXT: .LBB9_2: ; CI-NEXT: ; implicit-def: $vgpr4 -; CI-NEXT: .LBB9_3: ; %frem.compute +; CI-NEXT: .LBB9_3: ; %frem.compute19 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; CI-NEXT: v_frexp_mant_f32_e32 v4, v6 ; CI-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -5118,10 +5118,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 ; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6 -; CI-NEXT: .LBB9_5: ; %frem.loop_body +; CI-NEXT: .LBB9_5: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -5136,7 +5136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB9_5 ; CI-NEXT: ; %bb.6: ; %Flow55 ; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB9_7: ; %frem.loop_exit 
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6 ; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 ; CI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -5157,7 +5157,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; CI-NEXT: s_cbranch_vccz .LBB9_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v8, s0, 0, v0 @@ -5167,7 +5167,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB9_16 ; CI-NEXT: .LBB9_10: ; CI-NEXT: ; implicit-def: $vgpr5 -; CI-NEXT: .LBB9_11: ; %frem.compute19 +; CI-NEXT: .LBB9_11: ; %frem.compute ; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 ; CI-NEXT: v_frexp_mant_f32_e32 v5, v7 ; CI-NEXT: v_frexp_mant_f32_e32 v7, v6 @@ -5192,10 +5192,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 ; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB9_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 ; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7 -; CI-NEXT: .LBB9_13: ; %frem.loop_body27 +; CI-NEXT: .LBB9_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v10, v8 ; CI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -5210,7 +5210,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB9_13 ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v8, v10 -; CI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB9_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7 ; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -5275,7 +5275,7 @@ define amdgpu_kernel void 
@frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v3, |v1| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; VI-NEXT: s_cbranch_vccz .LBB9_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else20 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v2, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 @@ -5284,7 +5284,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_8 ; VI-NEXT: .LBB9_2: ; VI-NEXT: ; implicit-def: $vgpr2 -; VI-NEXT: .LBB9_3: ; %frem.compute +; VI-NEXT: .LBB9_3: ; %frem.compute19 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; VI-NEXT: v_frexp_mant_f32_e32 v2, v4 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v3 @@ -5309,10 +5309,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 ; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8 ; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4 -; VI-NEXT: .LBB9_5: ; %frem.loop_body +; VI-NEXT: .LBB9_5: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -5327,7 +5327,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB9_5 ; VI-NEXT: ; %bb.6: ; %Flow55 ; VI-NEXT: v_mov_b32_e32 v5, v7 -; VI-NEXT: .LBB9_7: ; %frem.loop_exit +; VI-NEXT: .LBB9_7: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4 ; VI-NEXT: v_ldexp_f32 v4, v5, v4 ; VI-NEXT: v_mul_f32_e32 v5, v4, v6 @@ -5347,7 +5347,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v6, |v4| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6 ; VI-NEXT: s_cbranch_vccz .LBB9_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; 
%frem.else ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v5, s2, 0, v3 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6 @@ -5356,7 +5356,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB9_16 ; VI-NEXT: .LBB9_10: ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: .LBB9_11: ; %frem.compute19 +; VI-NEXT: .LBB9_11: ; %frem.compute ; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7 ; VI-NEXT: v_frexp_mant_f32_e32 v5, v7 ; VI-NEXT: v_frexp_mant_f32_e32 v7, v6 @@ -5381,10 +5381,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7 ; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB9_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v7, vcc, 11, v7 -; VI-NEXT: .LBB9_13: ; %frem.loop_body27 +; VI-NEXT: .LBB9_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v10, v8 ; VI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -5399,7 +5399,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB9_13 ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v8, v10 -; VI-NEXT: .LBB9_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB9_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v7, vcc, -10, v7 ; VI-NEXT: v_ldexp_f32 v7, v8, v7 ; VI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -5443,7 +5443,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3 ; GFX9-NEXT: s_cbranch_vccz .LBB9_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else20 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3 @@ -5452,7 +5452,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB9_8 ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: .LBB9_3: ; %frem.compute +; GFX9-NEXT: .LBB9_3: ; %frem.compute19 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4 ; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v3 @@ -5477,10 +5477,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v4, 11, v4 -; GFX9-NEXT: .LBB9_5: ; %frem.loop_body +; GFX9-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -5495,7 +5495,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB9_5 ; GFX9-NEXT: ; %bb.6: ; %Flow55 ; GFX9-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX9-NEXT: v_add_u32_e32 v4, -10, v4 ; GFX9-NEXT: v_ldexp_f32 v4, v5, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6 @@ -5514,7 +5514,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; GFX9-NEXT: s_cbranch_vccz .LBB9_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -5523,7 +5523,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB9_16 ; GFX9-NEXT: .LBB9_10: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; 
GFX9-NEXT: .LBB9_11: ; %frem.compute19 +; GFX9-NEXT: .LBB9_11: ; %frem.compute ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -5548,10 +5548,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 -; GFX9-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX9-NEXT: .LBB9_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -5566,7 +5566,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB9_13 ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -5612,7 +5612,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: s_cbranch_vccz .LBB9_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else20 ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo @@ -5620,7 +5620,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB9_8 ; GFX10-NEXT: .LBB9_2: ; GFX10-NEXT: ; implicit-def: $vgpr2 -; GFX10-NEXT: .LBB9_3: ; %frem.compute +; GFX10-NEXT: .LBB9_3: ; %frem.compute19 ; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v4 ; 
GFX10-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -5647,10 +5647,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB9_5: ; %frem.loop_body +; GFX10-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v7, v4 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -5666,7 +5666,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow55 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v4, v7 -; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX10-NEXT: v_ldexp_f32 v4, v4, v6 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5 @@ -5684,7 +5684,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v3| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v4 ; GFX10-NEXT: s_cbranch_vccz .LBB9_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc_lo @@ -5692,7 +5692,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB9_16 ; GFX10-NEXT: .LBB9_10: ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: .LBB9_11: ; %frem.compute19 +; GFX10-NEXT: .LBB9_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v6 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 ; GFX10-NEXT: v_ldexp_f32 v6, v5, 11 @@ -5719,10 +5719,10 @@ define 
amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX10-NEXT: .LBB9_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -5738,7 +5738,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -5782,7 +5782,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 @@ -5793,7 +5793,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB9_8 ; GFX11-TRUE16-NEXT: .LBB9_2: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -5829,11 +5829,11 @@ define amdgpu_kernel 
void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -5853,7 +5853,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -5880,7 +5880,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10 -; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 @@ -5891,7 +5891,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB9_16 ; GFX11-TRUE16-NEXT: .LBB9_10: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 -; 
GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v6 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) @@ -5927,11 +5927,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v7 @@ -5951,7 +5951,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, v9 @@ -6002,7 +6002,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 
0x7fff, 0, v0 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6011,7 +6011,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB9_8 ; GFX11-FAKE16-NEXT: .LBB9_2: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, v4 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v3 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4 @@ -6047,11 +6047,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -6071,7 +6071,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -6097,7 +6097,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_10 -; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -6106,7 +6106,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB9_16 ; GFX11-FAKE16-NEXT: .LBB9_10: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6 -; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v7 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) @@ -6142,11 +6142,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v7 @@ -6166,7 +6166,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-FAKE16-NEXT: .LBB9_15: ; 
%frem.loop_exit28 +; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, -10, v9 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, v9 @@ -6220,7 +6220,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1150-TRUE16-NEXT: .LBB9_2: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6267,11 +6267,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s5, s6, s5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, 11 -; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6293,7 +6293,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6323,7 +6323,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 @@ -6335,7 +6335,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1150-TRUE16-NEXT: .LBB9_10: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6370,11 +6370,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s7, s8, s7 ; 
GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, 11 -; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6396,7 +6396,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6459,7 +6459,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6469,7 +6469,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB9_8 ; GFX1150-FAKE16-NEXT: .LBB9_2: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6504,11 +6504,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s5, s6, s5 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, 11 -; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6530,7 +6530,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6559,7 +6559,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6569,7 +6569,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB9_16 ; GFX1150-FAKE16-NEXT: .LBB9_10: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 -; 
GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6604,11 +6604,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s7, s8, s7 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, 11 -; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6630,7 +6630,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6690,7 +6690,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1200-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, s4 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 @@ -6702,7 +6702,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1200-TRUE16-NEXT: .LBB9_2: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6737,11 +6737,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s5, s6, s5 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, 11 -; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -6765,7 +6765,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -6799,7 +6799,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) 
%i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 @@ -6811,7 +6811,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1200-TRUE16-NEXT: .LBB9_10: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -6847,11 +6847,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s7, s8, s7 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, 11 -; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -6875,7 +6875,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit ; 
GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -6940,7 +6940,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2 -; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else20 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s6, s5 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -6950,7 +6950,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB9_8 ; GFX1200-FAKE16-NEXT: .LBB9_2: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute19 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -6986,11 +6986,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7 -; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s5, s6, s5 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, 11 -; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -7014,7 +7014,7 @@ define amdgpu_kernel void 
@frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow55 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s5 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -7047,7 +7047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10 -; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s7 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -7058,7 +7058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB9_16 ; GFX1200-FAKE16-NEXT: .LBB9_10: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8 @@ -7094,11 +7094,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15 -; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s7, s8, s7 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, 11 -; GFX1200-FAKE16-NEXT: 
.LBB9_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -7122,7 +7122,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28 +; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -7208,7 +7208,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: v_cvt_f16_f32_e32 v8, v6 ; SI-NEXT: s_cbranch_vccz .LBB10_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else86 ; SI-NEXT: v_bfi_b32 v11, s0, 0, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 @@ -7219,7 +7219,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_2: ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_3: ; %frem.compute +; SI-NEXT: .LBB10_3: ; %frem.compute85 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9 @@ -7255,10 +7255,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_5: ; %frem.loop_body +; SI-NEXT: .LBB10_5: ; 
%frem.loop_body93 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v11, v9 ; SI-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -7273,7 +7273,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_5 ; SI-NEXT: ; %bb.6: ; %Flow133 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB10_7: ; %frem.loop_exit +; SI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1 ; SI-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -7294,7 +7294,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11 ; SI-NEXT: s_cbranch_vccz .LBB10_10 -; SI-NEXT: ; %bb.9: ; %frem.else20 +; SI-NEXT: ; %bb.9: ; %frem.else53 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v12, s0, 0, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -7306,7 +7306,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_10: ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_11: ; %frem.compute19 +; SI-NEXT: .LBB10_11: ; %frem.compute52 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10 @@ -7342,10 +7342,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_13: ; %frem.loop_body27 +; SI-NEXT: .LBB10_13: ; %frem.loop_body60 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v12, v10 ; SI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -7360,7 +7360,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_13 ; SI-NEXT: ; %bb.14: ; %Flow129 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; SI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1 ; SI-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -7381,7 +7381,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12 ; SI-NEXT: s_cbranch_vccz .LBB10_18 -; SI-NEXT: ; %bb.17: ; %frem.else53 +; SI-NEXT: ; %bb.17: ; %frem.else20 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v13, s0, 0, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 @@ -7393,7 +7393,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_18: ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_19: ; %frem.compute52 +; SI-NEXT: .LBB10_19: ; %frem.compute19 ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11 @@ -7429,10 +7429,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_21: ; %frem.loop_body60 +; SI-NEXT: .LBB10_21: ; %frem.loop_body27 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v13, v11 ; SI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -7447,7 +7447,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_21 ; SI-NEXT: ; %bb.22: ; %Flow125 ; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; SI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; 
SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1 ; SI-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -7468,7 +7468,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13| ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13 ; SI-NEXT: s_cbranch_vccz .LBB10_26 -; SI-NEXT: ; %bb.25: ; %frem.else86 +; SI-NEXT: ; %bb.25: ; %frem.else ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v14, s0, 0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -7480,7 +7480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB10_26: ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB10_27: ; %frem.compute85 +; SI-NEXT: .LBB10_27: ; %frem.compute ; SI-NEXT: s_mov_b32 s3, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12 @@ -7516,10 +7516,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; SI-NEXT: s_cmp_lt_i32 s1, 12 ; SI-NEXT: s_cbranch_scc1 .LBB10_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s1, s2, s3 ; SI-NEXT: s_add_i32 s1, s1, 11 -; SI-NEXT: .LBB10_29: ; %frem.loop_body93 +; SI-NEXT: .LBB10_29: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v14, v12 ; SI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -7534,7 +7534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB10_29 ; SI-NEXT: ; %bb.30: ; %Flow ; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; SI-NEXT: .LBB10_31: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s1, s1, -10 ; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1 ; SI-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -7638,7 +7638,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: 
v_and_b32_e32 v9, 0x7fffffff, v7 ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB10_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else86 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; CI-NEXT: v_bfi_b32 v11, s0, 0, v6 ; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 @@ -7647,7 +7647,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_8 ; CI-NEXT: .LBB10_2: ; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB10_3: ; %frem.compute +; CI-NEXT: .LBB10_3: ; %frem.compute85 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 ; CI-NEXT: v_frexp_mant_f32_e32 v8, v10 ; CI-NEXT: v_frexp_mant_f32_e32 v10, v9 @@ -7672,10 +7672,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 ; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 ; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10 -; CI-NEXT: .LBB10_5: ; %frem.loop_body +; CI-NEXT: .LBB10_5: ; %frem.loop_body93 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v13, v11 ; CI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -7690,7 +7690,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_5 ; CI-NEXT: ; %bb.6: ; %Flow133 ; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB10_7: ; %frem.loop_exit +; CI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10 ; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 ; CI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -7711,7 +7711,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 ; CI-NEXT: s_cbranch_vccz .LBB10_10 -; CI-NEXT: ; %bb.9: ; %frem.else20 +; CI-NEXT: ; %bb.9: ; %frem.else53 
; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v12, s0, 0, v4 @@ -7721,7 +7721,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_16 ; CI-NEXT: .LBB10_10: ; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB10_11: ; %frem.compute19 +; CI-NEXT: .LBB10_11: ; %frem.compute52 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 ; CI-NEXT: v_frexp_mant_f32_e32 v9, v11 ; CI-NEXT: v_frexp_mant_f32_e32 v11, v10 @@ -7746,10 +7746,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 ; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11 -; CI-NEXT: .LBB10_13: ; %frem.loop_body27 +; CI-NEXT: .LBB10_13: ; %frem.loop_body60 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v14, v12 ; CI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -7764,7 +7764,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_13 ; CI-NEXT: ; %bb.14: ; %Flow129 ; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; CI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11 ; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 ; CI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -7785,7 +7785,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11 ; CI-NEXT: s_cbranch_vccz .LBB10_18 -; CI-NEXT: ; %bb.17: ; %frem.else53 +; CI-NEXT: ; %bb.17: ; %frem.else20 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v13, s0, 0, v2 @@ -7795,7 +7795,7 @@ define amdgpu_kernel void 
@frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_24 ; CI-NEXT: .LBB10_18: ; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB10_19: ; %frem.compute52 +; CI-NEXT: .LBB10_19: ; %frem.compute19 ; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12 ; CI-NEXT: v_frexp_mant_f32_e32 v10, v12 ; CI-NEXT: v_frexp_mant_f32_e32 v12, v11 @@ -7820,10 +7820,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12 ; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 ; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12 -; CI-NEXT: .LBB10_21: ; %frem.loop_body60 +; CI-NEXT: .LBB10_21: ; %frem.loop_body27 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v13 ; CI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -7838,7 +7838,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_21 ; CI-NEXT: ; %bb.22: ; %Flow125 ; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; CI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12 ; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 ; CI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -7859,7 +7859,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12| ; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 ; CI-NEXT: s_cbranch_vccz .LBB10_26 -; CI-NEXT: ; %bb.25: ; %frem.else86 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_brev_b32 s0, -2 ; CI-NEXT: v_bfi_b32 v14, s0, 0, v0 @@ -7869,7 +7869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB10_32 ; CI-NEXT: .LBB10_26: ; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: 
.LBB10_27: ; %frem.compute85 +; CI-NEXT: .LBB10_27: ; %frem.compute ; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 ; CI-NEXT: v_frexp_mant_f32_e32 v11, v13 ; CI-NEXT: v_frexp_mant_f32_e32 v13, v12 @@ -7894,10 +7894,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 ; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB10_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13 -; CI-NEXT: .LBB10_29: ; %frem.loop_body93 +; CI-NEXT: .LBB10_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v16, v14 ; CI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -7912,7 +7912,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB10_29 ; CI-NEXT: ; %bb.30: ; %Flow ; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; CI-NEXT: .LBB10_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13 ; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 ; CI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -8001,7 +8001,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v5, |v2| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; VI-NEXT: s_cbranch_vccz .LBB10_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else86 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -8010,7 +8010,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_8 ; VI-NEXT: .LBB10_2: ; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: .LBB10_3: ; %frem.compute +; VI-NEXT: .LBB10_3: ; %frem.compute85 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; VI-NEXT: v_frexp_mant_f32_e32 v4, v6 ; VI-NEXT: v_frexp_mant_f32_e32 
v6, v5 @@ -8035,10 +8035,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 ; VI-NEXT: v_add_u32_e32 v6, vcc, 11, v6 -; VI-NEXT: .LBB10_5: ; %frem.loop_body +; VI-NEXT: .LBB10_5: ; %frem.loop_body93 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -8053,7 +8053,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_5 ; VI-NEXT: ; %bb.6: ; %Flow133 ; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB10_7: ; %frem.loop_exit +; VI-NEXT: .LBB10_7: ; %frem.loop_exit94 ; VI-NEXT: v_add_u32_e32 v6, vcc, -10, v6 ; VI-NEXT: v_ldexp_f32 v6, v7, v6 ; VI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -8073,7 +8073,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v8, |v6| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 ; VI-NEXT: s_cbranch_vccz .LBB10_10 -; VI-NEXT: ; %bb.9: ; %frem.else20 +; VI-NEXT: ; %bb.9: ; %frem.else53 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v7, s2, 0, v5 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 @@ -8082,7 +8082,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_16 ; VI-NEXT: .LBB10_10: ; VI-NEXT: ; implicit-def: $vgpr7 -; VI-NEXT: .LBB10_11: ; %frem.compute19 +; VI-NEXT: .LBB10_11: ; %frem.compute52 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 ; VI-NEXT: v_frexp_mant_f32_e32 v7, v9 ; VI-NEXT: v_frexp_mant_f32_e32 v9, v8 @@ -8107,10 +8107,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 ; VI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 ; VI-NEXT: 
s_cbranch_vccnz .LBB10_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; VI-NEXT: v_sub_u32_e32 v9, vcc, v12, v13 ; VI-NEXT: v_add_u32_e32 v9, vcc, 11, v9 -; VI-NEXT: .LBB10_13: ; %frem.loop_body27 +; VI-NEXT: .LBB10_13: ; %frem.loop_body60 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v12, v10 ; VI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -8125,7 +8125,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_13 ; VI-NEXT: ; %bb.14: ; %Flow129 ; VI-NEXT: v_mov_b32_e32 v10, v12 -; VI-NEXT: .LBB10_15: ; %frem.loop_exit28 +; VI-NEXT: .LBB10_15: ; %frem.loop_exit61 ; VI-NEXT: v_add_u32_e32 v9, vcc, -10, v9 ; VI-NEXT: v_ldexp_f32 v9, v10, v9 ; VI-NEXT: v_mul_f32_e32 v10, v9, v11 @@ -8143,7 +8143,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v9, |v3| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v9 ; VI-NEXT: s_cbranch_vccz .LBB10_18 -; VI-NEXT: ; %bb.17: ; %frem.else53 +; VI-NEXT: ; %bb.17: ; %frem.else20 ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v8, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9 @@ -8152,7 +8152,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_24 ; VI-NEXT: .LBB10_18: ; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: .LBB10_19: ; %frem.compute52 +; VI-NEXT: .LBB10_19: ; %frem.compute19 ; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10 ; VI-NEXT: v_frexp_mant_f32_e32 v8, v10 ; VI-NEXT: v_frexp_mant_f32_e32 v10, v9 @@ -8177,10 +8177,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10 ; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_23 -; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 
; VI-NEXT: v_add_u32_e32 v10, vcc, 11, v10 -; VI-NEXT: .LBB10_21: ; %frem.loop_body60 +; VI-NEXT: .LBB10_21: ; %frem.loop_body27 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v13, v11 ; VI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -8195,7 +8195,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_21 ; VI-NEXT: ; %bb.22: ; %Flow125 ; VI-NEXT: v_mov_b32_e32 v11, v13 -; VI-NEXT: .LBB10_23: ; %frem.loop_exit61 +; VI-NEXT: .LBB10_23: ; %frem.loop_exit28 ; VI-NEXT: v_add_u32_e32 v10, vcc, -10, v10 ; VI-NEXT: v_ldexp_f32 v10, v11, v10 ; VI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -8215,7 +8215,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_f16_e64 v12, |v10| ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12 ; VI-NEXT: s_cbranch_vccz .LBB10_26 -; VI-NEXT: ; %bb.25: ; %frem.else86 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_movk_i32 s2, 0x7fff ; VI-NEXT: v_bfi_b32 v11, s2, 0, v9 ; VI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12 @@ -8224,7 +8224,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB10_32 ; VI-NEXT: .LBB10_26: ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB10_27: ; %frem.compute85 +; VI-NEXT: .LBB10_27: ; %frem.compute ; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13 ; VI-NEXT: v_frexp_mant_f32_e32 v11, v13 ; VI-NEXT: v_frexp_mant_f32_e32 v13, v12 @@ -8249,10 +8249,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13 ; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB10_31 -; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v13, vcc, 11, v13 -; VI-NEXT: .LBB10_29: ; %frem.loop_body93 +; VI-NEXT: .LBB10_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v16, v14 ; VI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -8267,7 +8267,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB10_29 ; VI-NEXT: ; %bb.30: ; %Flow ; VI-NEXT: v_mov_b32_e32 v14, v16 -; VI-NEXT: .LBB10_31: ; %frem.loop_exit94 +; VI-NEXT: .LBB10_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v13, vcc, -10, v13 ; VI-NEXT: v_ldexp_f32 v13, v14, v13 ; VI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -8320,7 +8320,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v5, |v0| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5 ; GFX9-NEXT: s_cbranch_vccz .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else86 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v2 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5 @@ -8329,7 +8329,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_8 ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: .LBB10_3: ; %frem.compute +; GFX9-NEXT: .LBB10_3: ; %frem.compute85 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5 @@ -8354,10 +8354,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 11, v6 -; GFX9-NEXT: .LBB10_5: ; %frem.loop_body +; GFX9-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -8372,7 +8372,7 @@ define amdgpu_kernel void @frem_v4f16(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_5 ; GFX9-NEXT: ; %bb.6: ; %Flow133 ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX9-NEXT: v_add_u32_e32 v6, -10, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -8391,7 +8391,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7 ; GFX9-NEXT: s_cbranch_vccz .LBB10_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else20 +; GFX9-NEXT: ; %bb.9: ; %frem.else53 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v6, s2, 0, v5 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7 @@ -8400,7 +8400,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_16 ; GFX9-NEXT: .LBB10_10: ; GFX9-NEXT: ; implicit-def: $vgpr6 -; GFX9-NEXT: .LBB10_11: ; %frem.compute19 +; GFX9-NEXT: .LBB10_11: ; %frem.compute52 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v8 ; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v8 ; GFX9-NEXT: v_frexp_mant_f32_e32 v8, v7 @@ -8425,10 +8425,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8 ; GFX9-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX9-NEXT: v_sub_u32_e32 v8, v11, v12 ; GFX9-NEXT: v_add_u32_e32 v8, 11, v8 -; GFX9-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX9-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v11, v9 ; GFX9-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -8443,7 +8443,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_13 ; 
GFX9-NEXT: ; %bb.14: ; %Flow129 ; GFX9-NEXT: v_mov_b32_e32 v9, v11 -; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX9-NEXT: v_add_u32_e32 v8, -10, v8 ; GFX9-NEXT: v_ldexp_f32 v8, v9, v8 ; GFX9-NEXT: v_mul_f32_e32 v9, v8, v10 @@ -8461,7 +8461,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v1| ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8 ; GFX9-NEXT: s_cbranch_vccz .LBB10_18 -; GFX9-NEXT: ; %bb.17: ; %frem.else53 +; GFX9-NEXT: ; %bb.17: ; %frem.else20 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v7, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8 @@ -8470,7 +8470,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_24 ; GFX9-NEXT: .LBB10_18: ; GFX9-NEXT: ; implicit-def: $vgpr7 -; GFX9-NEXT: .LBB10_19: ; %frem.compute52 +; GFX9-NEXT: .LBB10_19: ; %frem.compute19 ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v9 ; GFX9-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v8 @@ -8495,10 +8495,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9 ; GFX9-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX9-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX9-NEXT: v_sub_u32_e32 v9, v12, v13 ; GFX9-NEXT: v_add_u32_e32 v9, 11, v9 -; GFX9-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX9-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -8513,7 +8513,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_21 ; GFX9-NEXT: ; %bb.22: ; %Flow125 ; GFX9-NEXT: v_mov_b32_e32 v10, v12 -; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit61 +; 
GFX9-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX9-NEXT: v_add_u32_e32 v9, -10, v9 ; GFX9-NEXT: v_ldexp_f32 v9, v10, v9 ; GFX9-NEXT: v_mul_f32_e32 v10, v9, v11 @@ -8532,7 +8532,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10 ; GFX9-NEXT: s_cbranch_vccz .LBB10_26 -; GFX9-NEXT: ; %bb.25: ; %frem.else86 +; GFX9-NEXT: ; %bb.25: ; %frem.else ; GFX9-NEXT: s_movk_i32 s2, 0x7fff ; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v8 ; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10 @@ -8541,7 +8541,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB10_32 ; GFX9-NEXT: .LBB10_26: ; GFX9-NEXT: ; implicit-def: $vgpr9 -; GFX9-NEXT: .LBB10_27: ; %frem.compute85 +; GFX9-NEXT: .LBB10_27: ; %frem.compute ; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v11 ; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v11 ; GFX9-NEXT: v_frexp_mant_f32_e32 v11, v10 @@ -8566,10 +8566,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11 ; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX9-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v11, 11, v11 -; GFX9-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX9-NEXT: .LBB10_29: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -8584,7 +8584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB10_29 ; GFX9-NEXT: ; %bb.30: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 
v11, -10, v11 ; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -8640,7 +8640,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v5, |v0| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX10-NEXT: s_cbranch_vccz .LBB10_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else86 ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v2 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo @@ -8648,7 +8648,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_8 ; GFX10-NEXT: .LBB10_2: ; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: .LBB10_3: ; %frem.compute +; GFX10-NEXT: .LBB10_3: ; %frem.compute85 ; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX10-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ -8675,10 +8675,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_5: ; %frem.loop_body +; GFX10-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8694,7 +8694,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow133 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -8712,7 
+8712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v5| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v8, v7 ; GFX10-NEXT: s_cbranch_vccz .LBB10_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else20 +; GFX10-NEXT: ; %bb.9: ; %frem.else53 ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, 0, v5 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v8, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo @@ -8720,7 +8720,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_16 ; GFX10-NEXT: .LBB10_10: ; GFX10-NEXT: ; implicit-def: $vgpr6 -; GFX10-NEXT: .LBB10_11: ; %frem.compute19 +; GFX10-NEXT: .LBB10_11: ; %frem.compute52 ; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v8 ; GFX10-NEXT: v_frexp_mant_f32_e32 v10, v7 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v8 @@ -8747,10 +8747,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v10 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX10-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v11, v8 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8766,7 +8766,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow129 ; GFX10-NEXT: v_mov_b32_e32 v10, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, v11 -; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX10-NEXT: v_add_nc_u32_e32 v10, -10, v10 ; GFX10-NEXT: v_ldexp_f32 v8, v8, v10 ; GFX10-NEXT: v_mul_f32_e32 v9, v8, v9 @@ -8783,7 +8783,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v1| ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX10-NEXT: s_cbranch_vccz .LBB10_18 -; GFX10-NEXT: ; %bb.17: ; %frem.else53 +; GFX10-NEXT: ; %bb.17: ; %frem.else20 ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc_lo @@ -8791,7 +8791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_24 ; GFX10-NEXT: .LBB10_18: ; GFX10-NEXT: ; implicit-def: $vgpr7 -; GFX10-NEXT: .LBB10_19: ; %frem.compute52 +; GFX10-NEXT: .LBB10_19: ; %frem.compute19 ; GFX10-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX10-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 @@ -8818,10 +8818,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11 ; GFX10-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX10-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX10-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8837,7 +8837,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.22: ; %Flow125 ; GFX10-NEXT: v_mov_b32_e32 v11, s2 ; GFX10-NEXT: v_mov_b32_e32 v9, v12 -; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX10-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX10-NEXT: v_ldexp_f32 v9, v9, v11 ; GFX10-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -8855,7 +8855,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cvt_f32_f16_e64 v11, |v8| ; GFX10-NEXT: 
v_cmp_ngt_f32_e32 vcc_lo, v11, v10 ; GFX10-NEXT: s_cbranch_vccz .LBB10_26 -; GFX10-NEXT: ; %bb.25: ; %frem.else86 +; GFX10-NEXT: ; %bb.25: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, 0, v8 ; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v11, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo @@ -8863,7 +8863,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB10_32 ; GFX10-NEXT: .LBB10_26: ; GFX10-NEXT: ; implicit-def: $vgpr9 -; GFX10-NEXT: .LBB10_27: ; %frem.compute85 +; GFX10-NEXT: .LBB10_27: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e32 v9, v11 ; GFX10-NEXT: v_frexp_mant_f32_e32 v13, v10 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v11 @@ -8890,10 +8890,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v13 ; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX10-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 11 -; GFX10-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX10-NEXT: .LBB10_29: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; GFX10-NEXT: s_add_i32 s2, s2, -11 @@ -8909,7 +8909,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.30: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v13, s2 ; GFX10-NEXT: v_mov_b32_e32 v11, v14 -; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v13, -10, v13 ; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 ; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -8963,7 +8963,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: 
s_cbranch_vccz .LBB10_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 @@ -8974,7 +8974,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_8 ; GFX11-TRUE16-NEXT: .LBB10_2: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 -; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ -9010,11 +9010,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v6 @@ -9034,7 +9034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v9 -; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 
-10, v8 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, v8 @@ -9061,7 +9061,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10 -; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 @@ -9072,7 +9072,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_16 ; GFX11-TRUE16-NEXT: .LBB10_10: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 @@ -9108,11 +9108,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v9 @@ -9132,7 +9132,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v12 -; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, v11 @@ -9156,7 +9156,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18 -; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 @@ -9167,7 +9167,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_24 ; GFX11-TRUE16-NEXT: .LBB10_18: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8 -; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v10 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v12, v9 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 @@ -9203,11 +9203,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX11-TRUE16-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v10 @@ -9227,7 +9227,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, v12 @@ -9254,7 +9254,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26 -; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 @@ -9265,7 +9265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_branch .LBB10_32 ; GFX11-TRUE16-NEXT: .LBB10_26: ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v13 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v15, v12 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 @@ -9301,11 +9301,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader 
+; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, v13 @@ -9325,7 +9325,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v13, v15 @@ -9388,7 +9388,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_2 -; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9397,7 +9397,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_8 ; GFX11-FAKE16-NEXT: .LBB10_2: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 -; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, v6 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v5 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6 @@ 
-9433,11 +9433,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v6 @@ -9457,7 +9457,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v9 -; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -10, v8 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, v8 @@ -9483,7 +9483,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_10 -; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9492,7 +9492,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_16 ; GFX11-FAKE16-NEXT: 
.LBB10_10: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7 -; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v7, v9 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v8 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9 @@ -9528,11 +9528,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v9 @@ -9552,7 +9552,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v12 -; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -10, v11 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, v11 @@ -9575,7 +9575,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_18 -; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX11-FAKE16-NEXT: ; %bb.17: ; 
%frem.else20 ; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9584,7 +9584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_24 ; GFX11-FAKE16-NEXT: .LBB10_18: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8 -; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v10 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v12, v9 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10 @@ -9620,11 +9620,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v10 @@ -9644,7 +9644,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, -10, v12 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, v12 
@@ -9670,7 +9670,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_26 -; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9 ; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -9679,7 +9679,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_branch .LBB10_32 ; GFX11-FAKE16-NEXT: .LBB10_26: ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11 -; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v13 ; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v15, v12 ; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13 @@ -9715,11 +9715,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-FAKE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11 -; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v13 @@ -9739,7 +9739,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s2 ; 
GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, -10, v15 ; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v13, v15 @@ -9804,7 +9804,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 @@ -9816,7 +9816,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1150-TRUE16-NEXT: .LBB10_2: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -9851,11 +9851,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s6, s8, s6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, 11 -; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -9877,7 +9877,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -9907,7 +9907,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -9919,7 +9919,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1150-TRUE16-NEXT: .LBB10_10: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -9954,11 +9954,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-TRUE16-NEXT: ; 
%bb.12: ; %frem.loop_body60.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -9980,7 +9980,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10008,7 +10008,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10020,7 +10020,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1150-TRUE16-NEXT: .LBB10_18: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1150-TRUE16-NEXT: 
v_frexp_exp_i32_f32_e32 v5, s10 @@ -10055,11 +10055,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -10081,7 +10081,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -10111,7 +10111,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 @@ -10123,7 +10123,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1150-TRUE16-NEXT: .LBB10_26: ; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3 -; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -10158,11 +10158,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-TRUE16-NEXT: s_sub_i32 s11, s12, s11 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, 11 -; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v8, v5 @@ -10184,7 +10184,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -10265,7 +10265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 ; 
GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10275,7 +10275,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_8 ; GFX1150-FAKE16-NEXT: .LBB10_2: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -10310,11 +10310,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s6, s8, s6 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, 11 -; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -10336,7 +10336,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1150-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -10365,7 +10365,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10375,7 +10375,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_16 ; GFX1150-FAKE16-NEXT: .LBB10_10: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -10410,11 +10410,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ 
-10436,7 +10436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10463,7 +10463,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10473,7 +10473,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_24 ; GFX1150-FAKE16-NEXT: .LBB10_18: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -10508,11 +10508,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9 ; GFX1150-FAKE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11 -; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -10534,7 +10534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -10563,7 +10563,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 ; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -10573,7 +10573,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: s_branch .LBB10_32 ; GFX1150-FAKE16-NEXT: .LBB10_26: ; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -10608,11 +10608,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-FAKE16-NEXT: s_sub_i32 s11, s12, s11 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, 11 -; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v8, v5 @@ -10634,7 +10634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -10712,7 +10712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 @@ -10724,7 +10724,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1200-TRUE16-NEXT: .LBB10_2: ; GFX1200-TRUE16-NEXT: ; implicit-def: 
$vgpr0 -; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -10759,11 +10759,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s6, s8, s6 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, 11 -; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2 @@ -10787,7 +10787,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -10821,7 +10821,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 ; 
GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10833,7 +10833,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1200-TRUE16-NEXT: .LBB10_10: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -10869,11 +10869,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3 @@ -10897,7 +10897,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -10928,7 +10928,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, 
ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 @@ -10941,7 +10941,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1200-TRUE16-NEXT: .LBB10_18: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2 -; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute19 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -10977,11 +10977,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4 @@ -11005,7 +11005,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; 
GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -11039,7 +11039,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 ; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 @@ -11051,7 +11051,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1200-TRUE16-NEXT: .LBB10_26: ; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3 -; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -11087,11 +11087,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-TRUE16-NEXT: s_sub_co_i32 s11, s12, s11 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, 11 -; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: 
v_mov_b32_e32 v8, v5 @@ -11115,7 +11115,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: ; %bb.30: ; %Flow ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -11203,7 +11203,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2 -; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else86 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s6 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11213,7 +11213,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_8 ; GFX1200-FAKE16-NEXT: .LBB10_2: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute +; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute85 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -11249,11 +11249,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7 -; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s6, s8, s6 ; GFX1200-FAKE16-NEXT: 
s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, 11 -; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body +; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2 @@ -11277,7 +11277,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow133 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s6 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit +; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4 @@ -11310,7 +11310,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10 -; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else53 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11321,7 +11321,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_16 ; GFX1200-FAKE16-NEXT: .LBB10_10: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1 -; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute19 +; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute52 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -11357,11 +11357,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 
vcc_lo, 12, v5 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15 -; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader +; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27 +; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3 @@ -11385,7 +11385,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow129 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s9 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28 +; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5 @@ -11415,7 +11415,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18 -; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else20 ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 ; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -11426,7 +11426,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_24 ; GFX1200-FAKE16-NEXT: .LBB10_18: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2 -; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute52 +; GFX1200-FAKE16-NEXT: 
.LBB10_19: ; %frem.compute19 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10 @@ -11462,11 +11462,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23 -; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader +; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11 -; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60 +; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27 ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4 @@ -11490,7 +11490,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.22: ; %Flow125 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s9 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61 +; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6 @@ -11523,7 +11523,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2) ; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26 -; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else ; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s12, s11 ; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 ; GFX1200-FAKE16-NEXT: 
s_cselect_b32 vcc_lo, -1, 0 @@ -11534,7 +11534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: s_branch .LBB10_32 ; GFX1200-FAKE16-NEXT: .LBB10_26: ; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr3 -; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute85 +; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11 ; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12 ; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12 @@ -11570,11 +11570,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7 ; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31 -; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader +; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-FAKE16-NEXT: s_sub_co_i32 s11, s12, s11 ; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, 11 -; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93 +; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body ; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v8, v5 @@ -11598,7 +11598,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-FAKE16-NEXT: ; %bb.30: ; %Flow ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94 +; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7 ; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7 @@ -11686,7 +11686,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, 
|v2| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB11_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else16 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -11697,7 +11697,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB11_2: ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB11_3: ; %frem.compute +; SI-NEXT: .LBB11_3: ; %frem.compute15 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 @@ -11733,10 +11733,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB11_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB11_5: ; %frem.loop_body +; SI-NEXT: .LBB11_5: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v7, v5 ; SI-NEXT: v_mul_f32_e32 v5, v7, v6 @@ -11751,7 +11751,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB11_5 ; SI-NEXT: ; %bb.6: ; %Flow51 ; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: .LBB11_7: ; %frem.loop_exit +; SI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v5, v5, s3 ; SI-NEXT: v_mul_f32_e32 v6, v5, v6 @@ -11767,7 +11767,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB11_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; SI-NEXT: 
v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -11778,7 +11778,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB11_10: ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB11_11: ; %frem.compute15 +; SI-NEXT: .LBB11_11: ; %frem.compute ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1 @@ -11814,10 +11814,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB11_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB11_13: ; %frem.loop_body23 +; SI-NEXT: .LBB11_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v8, v6 ; SI-NEXT: v_mul_f32_e32 v6, v8, v7 @@ -11832,7 +11832,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB11_13 ; SI-NEXT: ; %bb.14: ; %Flow ; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB11_15: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v6, v6, s3 ; SI-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -11877,7 +11877,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB11_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -11886,7 +11886,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_8 ; CI-NEXT: .LBB11_2: ; CI-NEXT: ; implicit-def: $vgpr4 -; 
CI-NEXT: .LBB11_3: ; %frem.compute +; CI-NEXT: .LBB11_3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; CI-NEXT: v_ldexp_f32_e64 v5, v5, 1 ; CI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -11911,10 +11911,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10 ; CI-NEXT: v_add_i32_e32 v6, vcc, 12, v6 -; CI-NEXT: .LBB11_5: ; %frem.loop_body +; CI-NEXT: .LBB11_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v9, v7 ; CI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -11929,7 +11929,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB11_5 ; CI-NEXT: ; %bb.6: ; %Flow51 ; CI-NEXT: v_mov_b32_e32 v7, v9 -; CI-NEXT: .LBB11_7: ; %frem.loop_exit +; CI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6 ; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6 ; CI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -11945,7 +11945,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB11_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -11954,7 +11954,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB11_16 ; CI-NEXT: .LBB11_10: ; CI-NEXT: ; implicit-def: $vgpr5 -; CI-NEXT: .LBB11_11: ; %frem.compute15 +; CI-NEXT: .LBB11_11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1 ; CI-NEXT: v_div_scale_f32 v12, 
s[2:3], v6, v6, 1.0 @@ -11979,10 +11979,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB11_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11 ; CI-NEXT: v_add_i32_e32 v7, vcc, 12, v7 -; CI-NEXT: .LBB11_13: ; %frem.loop_body23 +; CI-NEXT: .LBB11_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v10, v8 ; CI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -11997,7 +11997,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB11_13 ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v8, v10 -; CI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB11_15: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7 ; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7 ; CI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12042,7 +12042,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB11_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v4, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -12051,7 +12051,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_8 ; VI-NEXT: .LBB11_2: ; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: .LBB11_3: ; %frem.compute +; VI-NEXT: .LBB11_3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; VI-NEXT: v_ldexp_f32 v5, v5, 1 ; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -12076,10 +12076,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; VI-NEXT: 
v_div_fixup_f32 v8, v8, v5, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10 ; VI-NEXT: v_add_u32_e32 v6, vcc, 12, v6 -; VI-NEXT: .LBB11_5: ; %frem.loop_body +; VI-NEXT: .LBB11_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v9, v7 ; VI-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -12094,7 +12094,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB11_5 ; VI-NEXT: ; %bb.6: ; %Flow51 ; VI-NEXT: v_mov_b32_e32 v7, v9 -; VI-NEXT: .LBB11_7: ; %frem.loop_exit +; VI-NEXT: .LBB11_7: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6 ; VI-NEXT: v_ldexp_f32 v6, v7, v6 ; VI-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -12110,7 +12110,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB11_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v5, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -12119,7 +12119,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB11_16 ; VI-NEXT: .LBB11_10: ; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: .LBB11_11: ; %frem.compute15 +; VI-NEXT: .LBB11_11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; VI-NEXT: v_ldexp_f32 v6, v6, 1 ; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 @@ -12144,10 +12144,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB11_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: 
v_sub_u32_e32 v7, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v7 -; VI-NEXT: .LBB11_13: ; %frem.loop_body23 +; VI-NEXT: .LBB11_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v10, v8 ; VI-NEXT: v_mul_f32_e32 v8, v10, v9 @@ -12162,7 +12162,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB11_13 ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v8, v10 -; VI-NEXT: .LBB11_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB11_15: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7 ; VI-NEXT: v_ldexp_f32 v7, v8, v7 ; VI-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12202,7 +12202,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else16 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v0 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2| @@ -12211,7 +12211,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB11_8 ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: ; implicit-def: $vgpr4 -; GFX9-NEXT: .LBB11_3: ; %frem.compute +; GFX9-NEXT: .LBB11_3: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX9-NEXT: v_ldexp_f32 v5, v5, 1 ; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0 @@ -12236,10 +12236,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6 ; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10 ; GFX9-NEXT: v_add_u32_e32 v6, 12, v6 -; GFX9-NEXT: .LBB11_5: ; %frem.loop_body +; GFX9-NEXT: .LBB11_5: ; 
%frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v9, v7 ; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8 @@ -12254,7 +12254,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB11_5 ; GFX9-NEXT: ; %bb.6: ; %Flow51 ; GFX9-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX9-NEXT: v_add_u32_e32 v6, -11, v6 ; GFX9-NEXT: v_ldexp_f32 v6, v7, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8 @@ -12270,7 +12270,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB11_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v5, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3| @@ -12279,7 +12279,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB11_16 ; GFX9-NEXT: .LBB11_10: ; GFX9-NEXT: ; implicit-def: $vgpr5 -; GFX9-NEXT: .LBB11_11: ; %frem.compute15 +; GFX9-NEXT: .LBB11_11: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX9-NEXT: v_ldexp_f32 v6, v6, 1 ; GFX9-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0 @@ -12304,10 +12304,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v7, v10, v11 ; GFX9-NEXT: v_add_u32_e32 v7, 12, v7 -; GFX9-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB11_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v10, v8 ; GFX9-NEXT: v_mul_f32_e32 
v8, v10, v9 @@ -12322,7 +12322,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB11_13 ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v8, v10 -; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v7, -11, v7 ; GFX9-NEXT: v_ldexp_f32 v7, v8, v7 ; GFX9-NEXT: v_mul_f32_e32 v8, v7, v9 @@ -12363,7 +12363,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB11_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else16 ; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo @@ -12371,7 +12371,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB11_8 ; GFX10-NEXT: .LBB11_2: ; GFX10-NEXT: ; implicit-def: $vgpr4 -; GFX10-NEXT: .LBB11_3: ; %frem.compute +; GFX10-NEXT: .LBB11_3: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX10-NEXT: v_frexp_mant_f32_e64 v4, |v0| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 @@ -12398,10 +12398,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB11_5: ; %frem.loop_body +; GFX10-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -12417,7 +12417,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow51 ; GFX10-NEXT: v_mov_b32_e32 v8, s2 ; GFX10-NEXT: v_mov_b32_e32 v6, v9 -; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_add_nc_u32_e32 v8, -11, v8 ; GFX10-NEXT: v_ldexp_f32 v6, v6, v8 ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7 @@ -12432,7 +12432,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB11_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| ; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo @@ -12440,7 +12440,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB11_16 ; GFX10-NEXT: .LBB11_10: ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: .LBB11_11: ; %frem.compute15 +; GFX10-NEXT: .LBB11_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v1| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 @@ -12467,10 +12467,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9 ; GFX10-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB11_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -12486,7 +12486,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow ; GFX10-NEXT: 
v_mov_b32_e32 v9, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, v10 -; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v9, -11, v9 ; GFX10-NEXT: v_ldexp_f32 v7, v7, v9 ; GFX10-NEXT: v_mul_f32_e32 v8, v7, v8 @@ -12524,7 +12524,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else16 ; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12533,7 +12533,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB11_8 ; GFX11-NEXT: .LBB11_2: ; GFX11-NEXT: ; implicit-def: $vgpr4 -; GFX11-NEXT: .LBB11_3: ; %frem.compute +; GFX11-NEXT: .LBB11_3: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v2| ; GFX11-NEXT: v_frexp_mant_f32_e64 v4, |v0| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v7, v0 @@ -12569,11 +12569,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB11_5: ; %frem.loop_body +; GFX11-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v9, v6 @@ -12593,7 +12593,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow51 ; 
GFX11-NEXT: v_mov_b32_e32 v8, s2 ; GFX11-NEXT: v_mov_b32_e32 v6, v9 -; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v8, -11, v8 ; GFX11-NEXT: v_ldexp_f32 v6, v6, v8 @@ -12613,7 +12613,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB11_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else ; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -12622,7 +12622,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB11_16 ; GFX11-NEXT: .LBB11_10: ; GFX11-NEXT: ; implicit-def: $vgpr5 -; GFX11-NEXT: .LBB11_11: ; %frem.compute15 +; GFX11-NEXT: .LBB11_11: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f32_e64 v6, |v3| ; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v1| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v1 @@ -12658,11 +12658,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB11_13: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v10, v7 @@ -12682,7 +12682,7 @@ define amdgpu_kernel void @frem_v2f32(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow ; GFX11-NEXT: v_mov_b32_e32 v9, s2 ; GFX11-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v9, -11, v9 ; GFX11-NEXT: v_ldexp_f32 v7, v7, v9 @@ -12730,7 +12730,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8 ; GFX1150-NEXT: s_cbranch_scc0 .LBB11_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 ; GFX1150-NEXT: s_cmp_eq_f32 s3, s8 ; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12740,7 +12740,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB11_8 ; GFX1150-NEXT: .LBB11_2: ; GFX1150-NEXT: ; implicit-def: $vgpr0 -; GFX1150-NEXT: .LBB11_3: ; %frem.compute +; GFX1150-NEXT: .LBB11_3: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s6| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -12775,11 +12775,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1150-NEXT: s_sub_i32 s7, s7, s8 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s7, s7, 12 -; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: 
v_mov_b32_e32 v5, v2 @@ -12801,7 +12801,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow51 ; GFX1150-NEXT: v_mov_b32_e32 v4, s7 ; GFX1150-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 @@ -12824,7 +12824,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s6, s8 ; GFX1150-NEXT: s_cbranch_scc0 .LBB11_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else ; GFX1150-NEXT: s_cmp_eq_f32 s6, s8 ; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12834,7 +12834,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB11_16 ; GFX1150-NEXT: .LBB11_10: ; GFX1150-NEXT: ; implicit-def: $vgpr1 -; GFX1150-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB11_11: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s2| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 @@ -12869,11 +12869,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s7, s7, s8 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s7, s7, 12 -; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v6, v3 @@ -12895,7 +12895,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow ; GFX1150-NEXT: v_mov_b32_e32 v5, s7 ; GFX1150-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 @@ -12950,7 +12950,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8 ; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 ; GFX1200-NEXT: s_cmp_eq_f32 s3, s8 ; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -12960,7 +12960,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB11_8 ; GFX1200-NEXT: .LBB11_2: ; GFX1200-NEXT: ; implicit-def: $vgpr0 -; GFX1200-NEXT: .LBB11_3: ; %frem.compute +; GFX1200-NEXT: .LBB11_3: ; %frem.compute15 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 @@ -12996,11 +12996,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 -; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body 
+; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v5, v2 @@ -13024,7 +13024,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.6: ; %Flow51 ; GFX1200-NEXT: v_mov_b32_e32 v4, s7 ; GFX1200-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit +; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 @@ -13048,7 +13048,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8 ; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else ; GFX1200-NEXT: s_cmp_eq_f32 s6, s8 ; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -13059,7 +13059,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB11_16 ; GFX1200-NEXT: .LBB11_10: ; GFX1200-NEXT: ; implicit-def: $vgpr1 -; GFX1200-NEXT: .LBB11_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB11_11: ; %frem.compute ; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5 @@ -13095,11 +13095,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s7, s7, 12 -; 
GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v6, v3 @@ -13123,7 +13123,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow ; GFX1200-NEXT: v_mov_b32_e32 v5, s7 ; GFX1200-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 @@ -13187,7 +13187,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else78 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13198,7 +13198,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_2: ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_3: ; %frem.compute +; SI-NEXT: .LBB12_3: ; %frem.compute77 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v0 @@ -13234,10 +13234,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_5: ; %frem.loop_body +; SI-NEXT: .LBB12_5: ; %frem.loop_body85 ; SI-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v11, v9 ; SI-NEXT: v_mul_f32_e32 v9, v11, v10 @@ -13252,7 +13252,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_5 ; SI-NEXT: ; %bb.6: ; %Flow125 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: .LBB12_7: ; %frem.loop_exit +; SI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v9, v9, s3 ; SI-NEXT: v_mul_f32_e32 v10, v9, v10 @@ -13268,7 +13268,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else47 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13279,7 +13279,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_10: ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_11: ; %frem.compute15 +; SI-NEXT: .LBB12_11: ; %frem.compute46 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v1 @@ -13315,10 +13315,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_13: ; %frem.loop_body23 +; SI-NEXT: .LBB12_13: ; %frem.loop_body54 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v12, v10 ; SI-NEXT: v_mul_f32_e32 v10, v12, v11 @@ -13333,7 +13333,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: 
s_cbranch_scc1 .LBB12_13 ; SI-NEXT: ; %bb.14: ; %Flow121 ; SI-NEXT: v_mov_b32_e32 v10, v12 -; SI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v10, v10, s3 ; SI-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -13349,7 +13349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_18 -; SI-NEXT: ; %bb.17: ; %frem.else47 +; SI-NEXT: ; %bb.17: ; %frem.else16 ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -13360,7 +13360,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_18: ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_19: ; %frem.compute46 +; SI-NEXT: .LBB12_19: ; %frem.compute15 ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2 @@ -13396,10 +13396,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_23 -; SI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; SI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_21: ; %frem.loop_body54 +; SI-NEXT: .LBB12_21: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v13, v11 ; SI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13414,7 +13414,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_21 ; SI-NEXT: ; %bb.22: ; %Flow117 ; SI-NEXT: v_mov_b32_e32 v11, v13 -; SI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; SI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; SI-NEXT: 
s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v11, v11, s3 ; SI-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -13430,7 +13430,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_cbranch_vccz .LBB12_26 -; SI-NEXT: ; %bb.25: ; %frem.else78 +; SI-NEXT: ; %bb.25: ; %frem.else ; SI-NEXT: s_brev_b32 s2, -2 ; SI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -13441,7 +13441,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB12_26: ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB12_27: ; %frem.compute77 +; SI-NEXT: .LBB12_27: ; %frem.compute ; SI-NEXT: s_mov_b32 s6, 0x7f800000 ; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6 ; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3 @@ -13477,10 +13477,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; SI-NEXT: s_cmp_lt_i32 s3, 13 ; SI-NEXT: s_cbranch_scc1 .LBB12_31 -; SI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s3, s4, s5 ; SI-NEXT: s_add_i32 s3, s3, 12 -; SI-NEXT: .LBB12_29: ; %frem.loop_body85 +; SI-NEXT: .LBB12_29: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v14, v12 ; SI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13495,7 +13495,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_cbranch_scc1 .LBB12_29 ; SI-NEXT: ; %bb.30: ; %Flow ; SI-NEXT: v_mov_b32_e32 v12, v14 -; SI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; SI-NEXT: .LBB12_31: ; %frem.loop_exit ; SI-NEXT: s_add_i32 s3, s3, -11 ; SI-NEXT: v_ldexp_f32_e64 v12, v12, s3 ; SI-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -13548,7 +13548,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i 
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else78 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13557,7 +13557,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_8 ; CI-NEXT: .LBB12_2: ; CI-NEXT: ; implicit-def: $vgpr8 -; CI-NEXT: .LBB12_3: ; %frem.compute +; CI-NEXT: .LBB12_3: ; %frem.compute77 ; CI-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; CI-NEXT: v_ldexp_f32_e64 v9, v9, 1 ; CI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -13582,10 +13582,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14 ; CI-NEXT: v_add_i32_e32 v10, vcc, 12, v10 -; CI-NEXT: .LBB12_5: ; %frem.loop_body +; CI-NEXT: .LBB12_5: ; %frem.loop_body85 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v13, v11 ; CI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13600,7 +13600,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_5 ; CI-NEXT: ; %bb.6: ; %Flow125 ; CI-NEXT: v_mov_b32_e32 v11, v13 -; CI-NEXT: .LBB12_7: ; %frem.loop_exit +; CI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10 ; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10 ; CI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -13616,7 +13616,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; 
CI-NEXT: ; %bb.9: ; %frem.else47 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13625,7 +13625,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_16 ; CI-NEXT: .LBB12_10: ; CI-NEXT: ; implicit-def: $vgpr9 -; CI-NEXT: .LBB12_11: ; %frem.compute15 +; CI-NEXT: .LBB12_11: ; %frem.compute46 ; CI-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; CI-NEXT: v_ldexp_f32_e64 v10, v10, 1 ; CI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -13650,10 +13650,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 v11, vcc, 12, v11 -; CI-NEXT: .LBB12_13: ; %frem.loop_body23 +; CI-NEXT: .LBB12_13: ; %frem.loop_body54 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v14, v12 ; CI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13668,7 +13668,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_13 ; CI-NEXT: ; %bb.14: ; %Flow121 ; CI-NEXT: v_mov_b32_e32 v12, v14 -; CI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11 ; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11 ; CI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -13684,7 +13684,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_18 -; CI-NEXT: ; %bb.17: ; %frem.else47 +; CI-NEXT: ; %bb.17: ; %frem.else16 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, 
|v2|, |v6| @@ -13693,7 +13693,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB12_24 ; CI-NEXT: .LBB12_18: ; CI-NEXT: ; implicit-def: $vgpr10 -; CI-NEXT: .LBB12_19: ; %frem.compute46 +; CI-NEXT: .LBB12_19: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; CI-NEXT: v_ldexp_f32_e64 v11, v11, 1 ; CI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -13718,10 +13718,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_23 -; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16 ; CI-NEXT: v_add_i32_e32 v12, vcc, 12, v12 -; CI-NEXT: .LBB12_21: ; %frem.loop_body54 +; CI-NEXT: .LBB12_21: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v13 ; CI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -13736,7 +13736,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_21 ; CI-NEXT: ; %bb.22: ; %Flow117 ; CI-NEXT: v_mov_b32_e32 v13, v15 -; CI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; CI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12 ; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12 ; CI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -13752,7 +13752,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB12_26 -; CI-NEXT: ; %bb.25: ; %frem.else78 +; CI-NEXT: ; %bb.25: ; %frem.else ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -13761,7 +13761,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch 
.LBB12_32 ; CI-NEXT: .LBB12_26: ; CI-NEXT: ; implicit-def: $vgpr11 -; CI-NEXT: .LBB12_27: ; %frem.compute77 +; CI-NEXT: .LBB12_27: ; %frem.compute ; CI-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; CI-NEXT: v_ldexp_f32_e64 v12, v12, 1 ; CI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -13786,10 +13786,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB12_31 -; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v13, vcc, 12, v13 -; CI-NEXT: .LBB12_29: ; %frem.loop_body85 +; CI-NEXT: .LBB12_29: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v16, v14 ; CI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -13804,7 +13804,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_cbranch_vccnz .LBB12_29 ; CI-NEXT: ; %bb.30: ; %Flow ; CI-NEXT: v_mov_b32_e32 v14, v16 -; CI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; CI-NEXT: .LBB12_31: ; %frem.loop_exit ; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13 ; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13 ; CI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -13857,7 +13857,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else78 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v8, s2, 0, v0 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -13866,7 +13866,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_8 ; VI-NEXT: .LBB12_2: ; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: .LBB12_3: ; %frem.compute +; VI-NEXT: .LBB12_3: ; %frem.compute77 ; 
VI-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; VI-NEXT: v_ldexp_f32 v9, v9, 1 ; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -13891,10 +13891,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14 ; VI-NEXT: v_add_u32_e32 v10, vcc, 12, v10 -; VI-NEXT: .LBB12_5: ; %frem.loop_body +; VI-NEXT: .LBB12_5: ; %frem.loop_body85 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v13, v11 ; VI-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -13909,7 +13909,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_5 ; VI-NEXT: ; %bb.6: ; %Flow125 ; VI-NEXT: v_mov_b32_e32 v11, v13 -; VI-NEXT: .LBB12_7: ; %frem.loop_exit +; VI-NEXT: .LBB12_7: ; %frem.loop_exit86 ; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10 ; VI-NEXT: v_ldexp_f32 v10, v11, v10 ; VI-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -13925,7 +13925,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else47 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v9, s2, 0, v1 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -13934,7 +13934,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_16 ; VI-NEXT: .LBB12_10: ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: .LBB12_11: ; %frem.compute15 +; VI-NEXT: .LBB12_11: ; %frem.compute46 ; VI-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; VI-NEXT: v_ldexp_f32 v10, v10, 1 ; VI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -13959,10 +13959,10 @@ define 
amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; VI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; VI-NEXT: v_sub_u32_e32 v11, vcc, v14, v15 ; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v11 -; VI-NEXT: .LBB12_13: ; %frem.loop_body23 +; VI-NEXT: .LBB12_13: ; %frem.loop_body54 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v14, v12 ; VI-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -13977,7 +13977,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_13 ; VI-NEXT: ; %bb.14: ; %Flow121 ; VI-NEXT: v_mov_b32_e32 v12, v14 -; VI-NEXT: .LBB12_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB12_15: ; %frem.loop_exit55 ; VI-NEXT: v_add_u32_e32 v11, vcc, -11, v11 ; VI-NEXT: v_ldexp_f32 v11, v12, v11 ; VI-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -13993,7 +13993,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_18 -; VI-NEXT: ; %bb.17: ; %frem.else47 +; VI-NEXT: ; %bb.17: ; %frem.else16 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v10, s2, 0, v2 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -14002,7 +14002,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_24 ; VI-NEXT: .LBB12_18: ; VI-NEXT: ; implicit-def: $vgpr10 -; VI-NEXT: .LBB12_19: ; %frem.compute46 +; VI-NEXT: .LBB12_19: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; VI-NEXT: v_ldexp_f32 v11, v11, 1 ; VI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -14027,10 +14027,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; VI-NEXT: v_div_fixup_f32 
v14, v14, v11, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_23 -; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v12, vcc, v15, v16 ; VI-NEXT: v_add_u32_e32 v12, vcc, 12, v12 -; VI-NEXT: .LBB12_21: ; %frem.loop_body54 +; VI-NEXT: .LBB12_21: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v15, v13 ; VI-NEXT: v_mul_f32_e32 v13, v15, v14 @@ -14045,7 +14045,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_21 ; VI-NEXT: ; %bb.22: ; %Flow117 ; VI-NEXT: v_mov_b32_e32 v13, v15 -; VI-NEXT: .LBB12_23: ; %frem.loop_exit55 +; VI-NEXT: .LBB12_23: ; %frem.loop_exit24 ; VI-NEXT: v_add_u32_e32 v12, vcc, -11, v12 ; VI-NEXT: v_ldexp_f32 v12, v13, v12 ; VI-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -14061,7 +14061,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB12_26 -; VI-NEXT: ; %bb.25: ; %frem.else78 +; VI-NEXT: ; %bb.25: ; %frem.else ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: v_bfi_b32 v11, s2, 0, v3 ; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -14070,7 +14070,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB12_32 ; VI-NEXT: .LBB12_26: ; VI-NEXT: ; implicit-def: $vgpr11 -; VI-NEXT: .LBB12_27: ; %frem.compute77 +; VI-NEXT: .LBB12_27: ; %frem.compute ; VI-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; VI-NEXT: v_ldexp_f32 v12, v12, 1 ; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -14095,10 +14095,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB12_31 -; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; VI-NEXT: ; %bb.28: ; 
%frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v13, vcc, 12, v13 -; VI-NEXT: .LBB12_29: ; %frem.loop_body85 +; VI-NEXT: .LBB12_29: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v16, v14 ; VI-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -14113,7 +14113,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_cbranch_vccnz .LBB12_29 ; VI-NEXT: ; %bb.30: ; %Flow ; VI-NEXT: v_mov_b32_e32 v14, v16 -; VI-NEXT: .LBB12_31: ; %frem.loop_exit86 +; VI-NEXT: .LBB12_31: ; %frem.loop_exit ; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13 ; VI-NEXT: v_ldexp_f32 v13, v14, v13 ; VI-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -14161,7 +14161,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else78 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v8, s2, 0, v0 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4| @@ -14170,7 +14170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_8 ; GFX9-NEXT: .LBB12_2: ; GFX9-NEXT: ; implicit-def: $vgpr8 -; GFX9-NEXT: .LBB12_3: ; %frem.compute +; GFX9-NEXT: .LBB12_3: ; %frem.compute77 ; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX9-NEXT: v_ldexp_f32 v9, v9, 1 ; GFX9-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0 @@ -14195,10 +14195,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10 ; GFX9-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX9-NEXT: v_sub_u32_e32 v10, v13, v14 ; GFX9-NEXT: v_add_u32_e32 v10, 12, v10 -; GFX9-NEXT: .LBB12_5: 
; %frem.loop_body +; GFX9-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-NEXT: v_mul_f32_e32 v11, v13, v12 @@ -14213,7 +14213,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_5 ; GFX9-NEXT: ; %bb.6: ; %Flow125 ; GFX9-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX9-NEXT: v_add_u32_e32 v10, -11, v10 ; GFX9-NEXT: v_ldexp_f32 v10, v11, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v10, v12 @@ -14229,7 +14229,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else47 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v1 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5| @@ -14238,7 +14238,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_16 ; GFX9-NEXT: .LBB12_10: ; GFX9-NEXT: ; implicit-def: $vgpr9 -; GFX9-NEXT: .LBB12_11: ; %frem.compute15 +; GFX9-NEXT: .LBB12_11: ; %frem.compute46 ; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX9-NEXT: v_ldexp_f32 v10, v10, 1 ; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0 @@ -14263,10 +14263,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11 ; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v11, 12, v11 -; GFX9-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v14, v12 ; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13 @@ -14281,7 +14281,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_13 ; GFX9-NEXT: ; %bb.14: ; %Flow121 ; GFX9-NEXT: v_mov_b32_e32 v12, v14 -; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX9-NEXT: v_add_u32_e32 v11, -11, v11 ; GFX9-NEXT: v_ldexp_f32 v11, v12, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13 @@ -14297,7 +14297,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_18 -; GFX9-NEXT: ; %bb.17: ; %frem.else47 +; GFX9-NEXT: ; %bb.17: ; %frem.else16 ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v10, s2, 0, v2 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6| @@ -14306,7 +14306,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_24 ; GFX9-NEXT: .LBB12_18: ; GFX9-NEXT: ; implicit-def: $vgpr10 -; GFX9-NEXT: .LBB12_19: ; %frem.compute46 +; GFX9-NEXT: .LBB12_19: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX9-NEXT: v_ldexp_f32 v11, v11, 1 ; GFX9-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0 @@ -14331,10 +14331,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12 ; GFX9-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX9-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX9-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v12, v15, v16 ; GFX9-NEXT: v_add_u32_e32 v12, 12, v12 -; GFX9-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX9-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v15, v13 ; GFX9-NEXT: v_mul_f32_e32 v13, 
v15, v14 @@ -14349,7 +14349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_21 ; GFX9-NEXT: ; %bb.22: ; %Flow117 ; GFX9-NEXT: v_mov_b32_e32 v13, v15 -; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX9-NEXT: v_add_u32_e32 v12, -11, v12 ; GFX9-NEXT: v_ldexp_f32 v12, v13, v12 ; GFX9-NEXT: v_mul_f32_e32 v13, v12, v14 @@ -14365,7 +14365,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB12_26 -; GFX9-NEXT: ; %bb.25: ; %frem.else78 +; GFX9-NEXT: ; %bb.25: ; %frem.else ; GFX9-NEXT: s_brev_b32 s2, -2 ; GFX9-NEXT: v_bfi_b32 v11, s2, 0, v3 ; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7| @@ -14374,7 +14374,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB12_32 ; GFX9-NEXT: .LBB12_26: ; GFX9-NEXT: ; implicit-def: $vgpr11 -; GFX9-NEXT: .LBB12_27: ; %frem.compute77 +; GFX9-NEXT: .LBB12_27: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX9-NEXT: v_ldexp_f32 v12, v12, 1 ; GFX9-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0 @@ -14399,10 +14399,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13 ; GFX9-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX9-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v13, v16, v17 ; GFX9-NEXT: v_add_u32_e32 v13, 12, v13 -; GFX9-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX9-NEXT: .LBB12_29: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v16, v14 ; GFX9-NEXT: v_mul_f32_e32 v14, v16, v15 @@ -14417,7 +14417,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) 
%out, ptr addrspace(1) %i ; GFX9-NEXT: s_cbranch_vccnz .LBB12_29 ; GFX9-NEXT: ; %bb.30: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v14, v16 -; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX9-NEXT: v_add_u32_e32 v13, -11, v13 ; GFX9-NEXT: v_ldexp_f32 v13, v14, v13 ; GFX9-NEXT: v_mul_f32_e32 v14, v13, v15 @@ -14466,7 +14466,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else78 ; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo @@ -14474,7 +14474,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_8 ; GFX10-NEXT: .LBB12_2: ; GFX10-NEXT: ; implicit-def: $vgpr8 -; GFX10-NEXT: .LBB12_3: ; %frem.compute +; GFX10-NEXT: .LBB12_3: ; %frem.compute77 ; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX10-NEXT: v_frexp_mant_f32_e64 v8, |v0| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 @@ -14501,10 +14501,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12 ; GFX10-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_5: ; %frem.loop_body +; GFX10-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v13, v10 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14520,7 +14520,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.6: ; %Flow125 ; GFX10-NEXT: 
v_mov_b32_e32 v12, s2 ; GFX10-NEXT: v_mov_b32_e32 v10, v13 -; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX10-NEXT: v_add_nc_u32_e32 v12, -11, v12 ; GFX10-NEXT: v_ldexp_f32 v10, v10, v12 ; GFX10-NEXT: v_mul_f32_e32 v11, v10, v11 @@ -14535,7 +14535,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else47 ; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| ; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo @@ -14543,7 +14543,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_16 ; GFX10-NEXT: .LBB12_10: ; GFX10-NEXT: ; implicit-def: $vgpr9 -; GFX10-NEXT: .LBB12_11: ; %frem.compute15 +; GFX10-NEXT: .LBB12_11: ; %frem.compute46 ; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v1| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 @@ -14570,10 +14570,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13 ; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14589,7 +14589,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.14: ; %Flow121 ; GFX10-NEXT: v_mov_b32_e32 v13, s2 ; GFX10-NEXT: v_mov_b32_e32 
v11, v14 -; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX10-NEXT: v_add_nc_u32_e32 v13, -11, v13 ; GFX10-NEXT: v_ldexp_f32 v11, v11, v13 ; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12 @@ -14604,7 +14604,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_18 -; GFX10-NEXT: ; %bb.17: ; %frem.else47 +; GFX10-NEXT: ; %bb.17: ; %frem.else16 ; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| ; GFX10-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo @@ -14612,7 +14612,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_24 ; GFX10-NEXT: .LBB12_18: ; GFX10-NEXT: ; implicit-def: $vgpr10 -; GFX10-NEXT: .LBB12_19: ; %frem.compute46 +; GFX10-NEXT: .LBB12_19: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v2| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 @@ -14639,10 +14639,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14 ; GFX10-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX10-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX10-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX10-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v15, v12 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14658,7 +14658,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.22: ; %Flow117 ; GFX10-NEXT: v_mov_b32_e32 v14, s2 ; GFX10-NEXT: v_mov_b32_e32 v12, v15 -; GFX10-NEXT: .LBB12_23: ; 
%frem.loop_exit55 +; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX10-NEXT: v_add_nc_u32_e32 v14, -11, v14 ; GFX10-NEXT: v_ldexp_f32 v12, v12, v14 ; GFX10-NEXT: v_mul_f32_e32 v13, v12, v13 @@ -14673,7 +14673,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB12_26 -; GFX10-NEXT: ; %bb.25: ; %frem.else78 +; GFX10-NEXT: ; %bb.25: ; %frem.else ; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 ; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| ; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo @@ -14681,7 +14681,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB12_32 ; GFX10-NEXT: .LBB12_26: ; GFX10-NEXT: ; implicit-def: $vgpr11 -; GFX10-NEXT: .LBB12_27: ; %frem.compute77 +; GFX10-NEXT: .LBB12_27: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v3| ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 @@ -14708,10 +14708,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15 ; GFX10-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX10-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 12 -; GFX10-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX10-NEXT: .LBB12_29: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v16, v13 ; GFX10-NEXT: s_add_i32 s2, s2, -12 @@ -14727,7 +14727,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: ; %bb.30: ; %Flow ; GFX10-NEXT: v_mov_b32_e32 v15, s2 ; GFX10-NEXT: v_mov_b32_e32 v13, v16 -; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX10-NEXT: .LBB12_31: ; 
%frem.loop_exit ; GFX10-NEXT: v_add_nc_u32_e32 v15, -11, v15 ; GFX10-NEXT: v_ldexp_f32 v13, v13, v15 ; GFX10-NEXT: v_mul_f32_e32 v14, v13, v14 @@ -14773,7 +14773,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else78 ; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14782,7 +14782,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_8 ; GFX11-NEXT: .LBB12_2: ; GFX11-NEXT: ; implicit-def: $vgpr8 -; GFX11-NEXT: .LBB12_3: ; %frem.compute +; GFX11-NEXT: .LBB12_3: ; %frem.compute77 ; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v4| ; GFX11-NEXT: v_frexp_mant_f32_e64 v8, |v0| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v0 @@ -14818,11 +14818,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_5: ; %frem.loop_body +; GFX11-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v13, v10 @@ -14842,7 +14842,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow125 ; GFX11-NEXT: v_mov_b32_e32 v12, s2 ; GFX11-NEXT: v_mov_b32_e32 v10, v13 -; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit +; 
GFX11-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v12, -11, v12 ; GFX11-NEXT: v_ldexp_f32 v10, v10, v12 @@ -14862,7 +14862,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else47 ; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14871,7 +14871,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_16 ; GFX11-NEXT: .LBB12_10: ; GFX11-NEXT: ; implicit-def: $vgpr9 -; GFX11-NEXT: .LBB12_11: ; %frem.compute15 +; GFX11-NEXT: .LBB12_11: ; %frem.compute46 ; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v5| ; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v1| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v12, v1 @@ -14907,11 +14907,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v14, v11 @@ -14931,7 +14931,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow121 ; GFX11-NEXT: v_mov_b32_e32 
v13, s2 ; GFX11-NEXT: v_mov_b32_e32 v11, v14 -; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v13, -11, v13 ; GFX11-NEXT: v_ldexp_f32 v11, v11, v13 @@ -14951,7 +14951,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_18 -; GFX11-NEXT: ; %bb.17: ; %frem.else47 +; GFX11-NEXT: ; %bb.17: ; %frem.else16 ; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -14960,7 +14960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_24 ; GFX11-NEXT: .LBB12_18: ; GFX11-NEXT: ; implicit-def: $vgpr10 -; GFX11-NEXT: .LBB12_19: ; %frem.compute46 +; GFX11-NEXT: .LBB12_19: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v6| ; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v2| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v13, v2 @@ -14996,11 +14996,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX11-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX11-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX11-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v15, v12 @@ -15020,7 +15020,7 @@ define amdgpu_kernel void @frem_v4f32(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.22: ; %Flow117 ; GFX11-NEXT: v_mov_b32_e32 v14, s2 ; GFX11-NEXT: v_mov_b32_e32 v12, v15 -; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v14, -11, v14 ; GFX11-NEXT: v_ldexp_f32 v12, v12, v14 @@ -15040,7 +15040,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB12_26 -; GFX11-NEXT: ; %bb.25: ; %frem.else78 +; GFX11-NEXT: ; %bb.25: ; %frem.else ; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3 ; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) @@ -15049,7 +15049,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB12_32 ; GFX11-NEXT: .LBB12_26: ; GFX11-NEXT: ; implicit-def: $vgpr11 -; GFX11-NEXT: .LBB12_27: ; %frem.compute77 +; GFX11-NEXT: .LBB12_27: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f32_e64 v12, |v7| ; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v3| ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v14, v3 @@ -15085,11 +15085,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX11-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX11-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 12 -; GFX11-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX11-NEXT: .LBB12_29: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: 
v_mov_b32_e32 v16, v13 @@ -15109,7 +15109,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.30: ; %Flow ; GFX11-NEXT: v_mov_b32_e32 v15, s2 ; GFX11-NEXT: v_mov_b32_e32 v13, v16 -; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e32 v15, -11, v15 ; GFX11-NEXT: v_ldexp_f32 v13, v13, v15 @@ -15170,7 +15170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else78 ; GFX1150-NEXT: s_cmp_eq_f32 s5, s12 ; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15180,7 +15180,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_8 ; GFX1150-NEXT: .LBB12_2: ; GFX1150-NEXT: ; implicit-def: $vgpr0 -; GFX1150-NEXT: .LBB12_3: ; %frem.compute +; GFX1150-NEXT: .LBB12_3: ; %frem.compute77 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -15215,11 +15215,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v5, v2 @@ -15241,7 +15241,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow125 ; GFX1150-NEXT: v_mov_b32_e32 v4, s11 ; GFX1150-NEXT: v_mov_b32_e32 v2, v5 -; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4 @@ -15264,7 +15264,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s8, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_10 -; GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else47 ; GFX1150-NEXT: s_cmp_eq_f32 s8, s12 ; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15274,7 +15274,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_16 ; GFX1150-NEXT: .LBB12_10: ; GFX1150-NEXT: ; implicit-def: $vgpr1 -; GFX1150-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB12_11: ; %frem.compute46 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s4| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s10| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s10 @@ -15309,11 +15309,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: 
.LBB12_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v6, v3 @@ -15335,7 +15335,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow121 ; GFX1150-NEXT: v_mov_b32_e32 v5, s11 ; GFX1150-NEXT: v_mov_b32_e32 v3, v6 -; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5 @@ -15358,7 +15358,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s10, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_18 -; GFX1150-NEXT: ; %bb.17: ; %frem.else47 +; GFX1150-NEXT: ; %bb.17: ; %frem.else16 ; GFX1150-NEXT: s_cmp_eq_f32 s10, s12 ; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15368,7 +15368,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_24 ; GFX1150-NEXT: .LBB12_18: ; GFX1150-NEXT: ; implicit-def: $vgpr2 -; GFX1150-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1150-NEXT: .LBB12_19: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s3| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s9| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 @@ -15403,11 +15403,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 ; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; 
GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v7, v4 @@ -15429,7 +15429,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.22: ; %Flow117 ; GFX1150-NEXT: v_mov_b32_e32 v6, s11 ; GFX1150-NEXT: v_mov_b32_e32 v4, v7 -; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v6, -11, v6 ; GFX1150-NEXT: v_ldexp_f32 v4, v4, v6 @@ -15452,7 +15452,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_cmp_ngt_f32 s9, s12 ; GFX1150-NEXT: s_cbranch_scc0 .LBB12_26 -; GFX1150-NEXT: ; %bb.25: ; %frem.else78 +; GFX1150-NEXT: ; %bb.25: ; %frem.else ; GFX1150-NEXT: s_cmp_eq_f32 s9, s12 ; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 ; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15462,7 +15462,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB12_32 ; GFX1150-NEXT: .LBB12_26: ; GFX1150-NEXT: ; implicit-def: $vgpr3 -; GFX1150-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1150-NEXT: .LBB12_27: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |s2| ; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s7| ; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 @@ -15497,11 +15497,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 ; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1150-NEXT: 
s_cbranch_vccnz .LBB12_31 -; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s11, s11, s12 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s11, s11, 12 -; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_mov_b32_e32 v8, v5 @@ -15523,7 +15523,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.30: ; %Flow ; GFX1150-NEXT: v_mov_b32_e32 v7, s11 ; GFX1150-NEXT: v_mov_b32_e32 v5, v8 -; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7 ; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7 @@ -15597,7 +15597,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else78 ; GFX1200-NEXT: s_cmp_eq_f32 s5, s12 ; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15607,7 +15607,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_8 ; GFX1200-NEXT: .LBB12_2: ; GFX1200-NEXT: ; implicit-def: $vgpr0 -; GFX1200-NEXT: .LBB12_3: ; %frem.compute +; GFX1200-NEXT: .LBB12_3: ; %frem.compute77 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8 @@ -15643,11 +15643,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i 
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4 ; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body85.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body +; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body85 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: v_mov_b32_e32 v5, v2 @@ -15670,7 +15670,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.6: ; %Flow125 ; GFX1200-NEXT: v_mov_b32_e32 v4, s11 ; GFX1200-NEXT: v_mov_b32_e32 v2, v5 -; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit +; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit86 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4 ; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4 @@ -15694,7 +15694,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else47 ; GFX1200-NEXT: s_cmp_eq_f32 s8, s12 ; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15705,7 +15705,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_16 ; GFX1200-NEXT: .LBB12_10: ; GFX1200-NEXT: ; implicit-def: $vgpr1 -; GFX1200-NEXT: .LBB12_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB12_11: ; %frem.compute46 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, 
s10 @@ -15741,11 +15741,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5 ; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body54.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body54 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v6, v3 @@ -15769,7 +15769,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow121 ; GFX1200-NEXT: v_mov_b32_e32 v5, s11 ; GFX1200-NEXT: v_mov_b32_e32 v3, v6 -; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit55 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5 ; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5 @@ -15793,7 +15793,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18 -; GFX1200-NEXT: ; %bb.17: ; %frem.else47 +; GFX1200-NEXT: ; %bb.17: ; %frem.else16 ; GFX1200-NEXT: s_cmp_eq_f32 s10, s12 ; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15804,7 +15804,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_24 ; GFX1200-NEXT: .LBB12_18: ; GFX1200-NEXT: ; implicit-def: $vgpr2 -; GFX1200-NEXT: .LBB12_19: ; %frem.compute46 +; GFX1200-NEXT: .LBB12_19: ; %frem.compute15 ; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3| ; GFX1200-NEXT: 
v_frexp_mant_f32_e64 v2, |s9| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9 @@ -15840,11 +15840,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6 ; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23 -; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader +; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54 +; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v7, v4 @@ -15868,7 +15868,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.22: ; %Flow117 ; GFX1200-NEXT: v_mov_b32_e32 v6, s11 ; GFX1200-NEXT: v_mov_b32_e32 v4, v7 -; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55 +; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6 ; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6 @@ -15892,7 +15892,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12 ; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26 -; GFX1200-NEXT: ; %bb.25: ; %frem.else78 +; GFX1200-NEXT: ; %bb.25: ; %frem.else ; GFX1200-NEXT: s_cmp_eq_f32 s9, s12 ; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7 ; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0 @@ -15903,7 +15903,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB12_32 ; GFX1200-NEXT: .LBB12_26: ; GFX1200-NEXT: ; implicit-def: $vgpr3 -; GFX1200-NEXT: .LBB12_27: ; %frem.compute77 +; GFX1200-NEXT: .LBB12_27: ; %frem.compute ; 
GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2| ; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7| ; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7 @@ -15939,11 +15939,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7 ; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31 -; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader +; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s11, s11, 12 -; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85 +; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-NEXT: v_mov_b32_e32 v8, v5 @@ -15967,7 +15967,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.30: ; %Flow ; GFX1200-NEXT: v_mov_b32_e32 v7, s11 ; GFX1200-NEXT: v_mov_b32_e32 v5, v8 -; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86 +; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7 ; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7 @@ -16048,7 +16048,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| ; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz .LBB13_2 -; SI-NEXT: ; %bb.1: ; %frem.else +; SI-NEXT: ; %bb.1: ; %frem.else16 ; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; SI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16059,7 +16059,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB13_2: ; SI-NEXT: ; implicit-def: $vgpr8_vgpr9 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB13_3: ; 
%frem.compute +; SI-NEXT: .LBB13_3: ; %frem.compute15 ; SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v1 ; SI-NEXT: s_mov_b32 s0, 0 @@ -16105,13 +16105,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; SI-NEXT: s_cmp_lt_i32 s6, 27 ; SI-NEXT: s_cbranch_scc1 .LBB13_7 -; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; SI-NEXT: s_sub_i32 s0, s3, s7 ; SI-NEXT: s_add_i32 s6, s0, 26 ; SI-NEXT: s_mov_b32 s3, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v18, 0x43300000 ; SI-NEXT: v_mov_b32_e32 v14, 0 -; SI-NEXT: .LBB13_5: ; %frem.loop_body +; SI-NEXT: .LBB13_5: ; %frem.loop_body23 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v17, v11 ; SI-NEXT: v_mov_b32_e32 v16, v10 @@ -16134,7 +16134,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: ; %bb.6: ; %Flow51 ; SI-NEXT: v_mov_b32_e32 v10, v16 ; SI-NEXT: v_mov_b32_e32 v11, v17 -; SI-NEXT: .LBB13_7: ; %frem.loop_exit +; SI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; SI-NEXT: s_sub_i32 s0, s6, 25 ; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s0 ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] @@ -16160,7 +16160,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]| ; SI-NEXT: s_and_b64 vcc, exec, s[0:1] ; SI-NEXT: s_cbranch_vccz .LBB13_10 -; SI-NEXT: ; %bb.9: ; %frem.else16 +; SI-NEXT: ; %bb.9: ; %frem.else ; SI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; SI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16171,7 +16171,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: .LBB13_10: ; SI-NEXT: ; implicit-def: $vgpr10_vgpr11 ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: .LBB13_11: ; %frem.compute15 +; SI-NEXT: .LBB13_11: ; %frem.compute ; 
SI-NEXT: s_brev_b32 s5, -2 ; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v3 ; SI-NEXT: s_mov_b32 s0, 0 @@ -16217,13 +16217,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; SI-NEXT: s_cmp_lt_i32 s6, 27 ; SI-NEXT: s_cbranch_scc1 .LBB13_15 -; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; SI-NEXT: s_sub_i32 s0, s3, s7 ; SI-NEXT: s_add_i32 s6, s0, 26 ; SI-NEXT: s_mov_b32 s3, 0x432fffff ; SI-NEXT: v_mov_b32_e32 v20, 0x43300000 ; SI-NEXT: v_mov_b32_e32 v16, 0 -; SI-NEXT: .LBB13_13: ; %frem.loop_body23 +; SI-NEXT: .LBB13_13: ; %frem.loop_body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v19, v13 ; SI-NEXT: v_mov_b32_e32 v18, v12 @@ -16246,7 +16246,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: ; %bb.14: ; %Flow ; SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: v_mov_b32_e32 v13, v19 -; SI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; SI-NEXT: .LBB13_15: ; %frem.loop_exit ; SI-NEXT: s_sub_i32 s0, s6, 25 ; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], s0 ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] @@ -16304,7 +16304,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB13_2 -; CI-NEXT: ; %bb.1: ; %frem.else +; CI-NEXT: ; %bb.1: ; %frem.else16 ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; CI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16313,7 +16313,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB13_8 ; CI-NEXT: .LBB13_2: ; CI-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CI-NEXT: .LBB13_3: ; %frem.compute +; CI-NEXT: .LBB13_3: ; %frem.compute15 ; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; 
CI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; CI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16337,10 +16337,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; CI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_7 -; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; CI-NEXT: v_sub_i32_e32 v14, vcc, v14, v15 ; CI-NEXT: v_add_i32_e32 v17, vcc, 26, v14 -; CI-NEXT: .LBB13_5: ; %frem.loop_body +; CI-NEXT: .LBB13_5: ; %frem.loop_body23 ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v15, v11 ; CI-NEXT: v_mov_b32_e32 v14, v10 @@ -16358,7 +16358,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; %bb.6: ; %Flow51 ; CI-NEXT: v_mov_b32_e32 v10, v14 ; CI-NEXT: v_mov_b32_e32 v11, v15 -; CI-NEXT: .LBB13_7: ; %frem.loop_exit +; CI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; CI-NEXT: v_subrev_i32_e32 v14, vcc, 25, v17 ; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; CI-NEXT: s_brev_b32 s2, -2 @@ -16375,7 +16375,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; CI-NEXT: s_and_b64 vcc, exec, s[2:3] ; CI-NEXT: s_cbranch_vccz .LBB13_10 -; CI-NEXT: ; %bb.9: ; %frem.else16 +; CI-NEXT: ; %bb.9: ; %frem.else ; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; CI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; CI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16384,7 +16384,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: s_branch .LBB13_16 ; CI-NEXT: .LBB13_10: ; CI-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CI-NEXT: .LBB13_11: ; %frem.compute15 +; CI-NEXT: .LBB13_11: ; %frem.compute ; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; CI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; CI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] 
@@ -16408,10 +16408,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; CI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; CI-NEXT: s_cbranch_vccnz .LBB13_15 -; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; CI-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 ; CI-NEXT: v_add_i32_e32 v19, vcc, 26, v16 -; CI-NEXT: .LBB13_13: ; %frem.loop_body23 +; CI-NEXT: .LBB13_13: ; %frem.loop_body ; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: v_mov_b32_e32 v17, v13 ; CI-NEXT: v_mov_b32_e32 v16, v12 @@ -16429,7 +16429,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: ; %bb.14: ; %Flow ; CI-NEXT: v_mov_b32_e32 v12, v16 ; CI-NEXT: v_mov_b32_e32 v13, v17 -; CI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; CI-NEXT: .LBB13_15: ; %frem.loop_exit ; CI-NEXT: v_subrev_i32_e32 v16, vcc, 25, v19 ; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; CI-NEXT: s_brev_b32 s2, -2 @@ -16478,7 +16478,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB13_2 -; VI-NEXT: ; %bb.1: ; %frem.else +; VI-NEXT: ; %bb.1: ; %frem.else16 ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; VI-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; VI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16487,7 +16487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB13_8 ; VI-NEXT: .LBB13_2: ; VI-NEXT: ; implicit-def: $vgpr8_vgpr9 -; VI-NEXT: .LBB13_3: ; %frem.compute +; VI-NEXT: .LBB13_3: ; %frem.compute15 ; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; VI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; VI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16511,10 +16511,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr 
addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; VI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_7 -; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; VI-NEXT: v_sub_u32_e32 v14, vcc, v14, v15 ; VI-NEXT: v_add_u32_e32 v17, vcc, 26, v14 -; VI-NEXT: .LBB13_5: ; %frem.loop_body +; VI-NEXT: .LBB13_5: ; %frem.loop_body23 ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v15, v11 ; VI-NEXT: v_mov_b32_e32 v14, v10 @@ -16532,7 +16532,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; %bb.6: ; %Flow51 ; VI-NEXT: v_mov_b32_e32 v10, v14 ; VI-NEXT: v_mov_b32_e32 v11, v15 -; VI-NEXT: .LBB13_7: ; %frem.loop_exit +; VI-NEXT: .LBB13_7: ; %frem.loop_exit24 ; VI-NEXT: v_subrev_u32_e32 v14, vcc, 25, v17 ; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; VI-NEXT: s_brev_b32 s2, -2 @@ -16549,7 +16549,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; VI-NEXT: s_and_b64 vcc, exec, s[2:3] ; VI-NEXT: s_cbranch_vccz .LBB13_10 -; VI-NEXT: ; %bb.9: ; %frem.else16 +; VI-NEXT: ; %bb.9: ; %frem.else ; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; VI-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; VI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16558,7 +16558,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_branch .LBB13_16 ; VI-NEXT: .LBB13_10: ; VI-NEXT: ; implicit-def: $vgpr10_vgpr11 -; VI-NEXT: .LBB13_11: ; %frem.compute15 +; VI-NEXT: .LBB13_11: ; %frem.compute ; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; VI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; VI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] @@ -16582,10 +16582,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; VI-NEXT: v_div_fixup_f64 
v[14:15], v[14:15], v[10:11], 1.0 ; VI-NEXT: s_cbranch_vccnz .LBB13_15 -; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; VI-NEXT: v_sub_u32_e32 v16, vcc, v16, v17 ; VI-NEXT: v_add_u32_e32 v19, vcc, 26, v16 -; VI-NEXT: .LBB13_13: ; %frem.loop_body23 +; VI-NEXT: .LBB13_13: ; %frem.loop_body ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v17, v13 ; VI-NEXT: v_mov_b32_e32 v16, v12 @@ -16603,7 +16603,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: ; %bb.14: ; %Flow ; VI-NEXT: v_mov_b32_e32 v12, v16 ; VI-NEXT: v_mov_b32_e32 v13, v17 -; VI-NEXT: .LBB13_15: ; %frem.loop_exit24 +; VI-NEXT: .LBB13_15: ; %frem.loop_exit ; VI-NEXT: v_subrev_u32_e32 v16, vcc, 25, v19 ; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; VI-NEXT: s_brev_b32 s2, -2 @@ -16647,7 +16647,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB13_2 -; GFX9-NEXT: ; %bb.1: ; %frem.else +; GFX9-NEXT: ; %bb.1: ; %frem.else16 ; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]| ; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc @@ -16656,7 +16656,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB13_8 ; GFX9-NEXT: .LBB13_2: ; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX9-NEXT: .LBB13_3: ; %frem.compute +; GFX9-NEXT: .LBB13_3: ; %frem.compute15 ; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5] ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1] @@ -16680,10 +16680,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17 ; GFX9-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX9-NEXT: 
s_cbranch_vccnz .LBB13_7 -; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX9-NEXT: v_sub_u32_e32 v14, v14, v15 ; GFX9-NEXT: v_add_u32_e32 v17, 26, v14 -; GFX9-NEXT: .LBB13_5: ; %frem.loop_body +; GFX9-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v15, v11 ; GFX9-NEXT: v_mov_b32_e32 v14, v10 @@ -16701,7 +16701,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: ; %bb.6: ; %Flow51 ; GFX9-NEXT: v_mov_b32_e32 v10, v14 ; GFX9-NEXT: v_mov_b32_e32 v11, v15 -; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX9-NEXT: v_subrev_u32_e32 v14, 25, v17 ; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; GFX9-NEXT: s_brev_b32 s2, -2 @@ -16718,7 +16718,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]| ; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB13_10 -; GFX9-NEXT: ; %bb.9: ; %frem.else16 +; GFX9-NEXT: ; %bb.9: ; %frem.else ; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]| ; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc @@ -16727,7 +16727,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_branch .LBB13_16 ; GFX9-NEXT: .LBB13_10: ; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX9-NEXT: .LBB13_11: ; %frem.compute15 +; GFX9-NEXT: .LBB13_11: ; %frem.compute ; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7] ; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3] @@ -16751,10 +16751,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19 ; GFX9-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX9-NEXT: s_cbranch_vccnz .LBB13_15 -; 
GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX9-NEXT: v_sub_u32_e32 v16, v16, v17 ; GFX9-NEXT: v_add_u32_e32 v19, 26, v16 -; GFX9-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX9-NEXT: .LBB13_13: ; %frem.loop_body ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-NEXT: v_mov_b32_e32 v16, v12 @@ -16772,7 +16772,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: ; %bb.14: ; %Flow ; GFX9-NEXT: v_mov_b32_e32 v12, v16 ; GFX9-NEXT: v_mov_b32_e32 v13, v17 -; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX9-NEXT: v_subrev_u32_e32 v16, 25, v19 ; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; GFX9-NEXT: s_brev_b32 s2, -2 @@ -16817,7 +16817,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB13_2 -; GFX10-NEXT: ; %bb.1: ; %frem.else +; GFX10-NEXT: ; %bb.1: ; %frem.else16 ; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX10-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo @@ -16826,7 +16826,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB13_8 ; GFX10-NEXT: .LBB13_2: ; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX10-NEXT: .LBB13_3: ; %frem.compute +; GFX10-NEXT: .LBB13_3: ; %frem.compute15 ; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -16851,10 +16851,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX10-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX10-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX10-NEXT: ; 
%bb.4: ; %frem.loop_body.preheader +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 26 -; GFX10-NEXT: .LBB13_5: ; %frem.loop_body +; GFX10-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; GFX10-NEXT: v_mov_b32_e32 v14, v10 @@ -16873,7 +16873,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_mov_b32_e32 v10, v14 ; GFX10-NEXT: v_mov_b32_e32 v17, s2 ; GFX10-NEXT: v_mov_b32_e32 v11, v15 -; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13] @@ -16889,7 +16889,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_vccz .LBB13_10 -; GFX10-NEXT: ; %bb.9: ; %frem.else16 +; GFX10-NEXT: ; %bb.9: ; %frem.else ; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX10-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo @@ -16898,7 +16898,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: s_branch .LBB13_16 ; GFX10-NEXT: .LBB13_10: ; GFX10-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX10-NEXT: .LBB13_11: ; %frem.compute15 +; GFX10-NEXT: .LBB13_11: ; %frem.compute ; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -16923,10 +16923,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX10-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX10-NEXT: s_cbranch_vccnz 
.LBB13_15 -; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX10-NEXT: s_sub_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s2, s2, 26 -; GFX10-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX10-NEXT: .LBB13_13: ; %frem.loop_body ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v17, v13 ; GFX10-NEXT: v_mov_b32_e32 v16, v12 @@ -16945,7 +16945,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX10-NEXT: v_mov_b32_e32 v12, v16 ; GFX10-NEXT: v_mov_b32_e32 v19, s2 ; GFX10-NEXT: v_mov_b32_e32 v13, v17 -; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15] @@ -16986,7 +16986,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB13_2 -; GFX11-NEXT: ; %bb.1: ; %frem.else +; GFX11-NEXT: ; %bb.1: ; %frem.else16 ; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX11-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -16996,7 +16996,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB13_8 ; GFX11-NEXT: .LBB13_2: ; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX11-NEXT: .LBB13_3: ; %frem.compute +; GFX11-NEXT: .LBB13_3: ; %frem.compute15 ; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17029,12 +17029,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[12:13], 
v[12:13], v[8:9], 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 26 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_5: ; %frem.loop_body +; GFX11-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17054,7 +17054,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.6: ; %Flow51 ; GFX11-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX11-NEXT: v_mov_b32_e32 v11, v15 -; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17074,7 +17074,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB13_10 -; GFX11-NEXT: ; %bb.9: ; %frem.else16 +; GFX11-NEXT: ; %bb.9: ; %frem.else ; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX11-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17084,7 +17084,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_branch .LBB13_16 ; GFX11-NEXT: .LBB13_10: ; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX11-NEXT: .LBB13_11: ; %frem.compute15 +; GFX11-NEXT: .LBB13_11: ; %frem.compute ; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX11-NEXT: 
v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17117,12 +17117,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX11-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s2, 26 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX11-NEXT: .LBB13_13: ; %frem.loop_body ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17142,7 +17142,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: ; %bb.14: ; %Flow ; GFX11-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX11-NEXT: v_mov_b32_e32 v13, v17 -; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 @@ -17187,7 +17187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1150-NEXT: s_cbranch_vccz .LBB13_2 -; GFX1150-NEXT: ; %bb.1: ; %frem.else +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 ; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX1150-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17197,7 +17197,7 @@ define amdgpu_kernel void @frem_v2f64(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB13_8 ; GFX1150-NEXT: .LBB13_2: ; GFX1150-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1150-NEXT: .LBB13_3: ; %frem.compute +; GFX1150-NEXT: .LBB13_3: ; %frem.compute15 ; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17229,12 +17229,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX1150-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1150-NEXT: s_sub_i32 s2, s2, s3 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s2, s2, 26 ; GFX1150-NEXT: .p2align 6 -; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body +; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17254,7 +17254,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.6: ; %Flow51 ; GFX1150-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX1150-NEXT: v_mov_b32_e32 v11, v15 -; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17274,7 +17274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]| ; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1150-NEXT: s_cbranch_vccz .LBB13_10 -; 
GFX1150-NEXT: ; %bb.9: ; %frem.else16 +; GFX1150-NEXT: ; %bb.9: ; %frem.else ; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX1150-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17284,7 +17284,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: s_branch .LBB13_16 ; GFX1150-NEXT: .LBB13_10: ; GFX1150-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1150-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1150-NEXT: .LBB13_11: ; %frem.compute ; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17316,12 +17316,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX1150-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX1150-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1150-NEXT: s_sub_i32 s2, s2, s3 ; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1150-NEXT: s_add_i32 s2, s2, 26 ; GFX1150-NEXT: .p2align 6 -; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body ; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1150-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17341,7 +17341,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-NEXT: ; %bb.14: ; %Flow ; GFX1150-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX1150-NEXT: v_mov_b32_e32 v13, v17 -; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1150-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 @@ -17386,7 +17386,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]| ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1200-NEXT: s_cbranch_vccz .LBB13_2 -; GFX1200-NEXT: ; %bb.1: ; %frem.else +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 ; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]| ; GFX1200-NEXT: v_and_b32_e32 v8, 0x80000000, v1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) @@ -17396,7 +17396,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB13_8 ; GFX1200-NEXT: .LBB13_2: ; GFX1200-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GFX1200-NEXT: .LBB13_3: ; %frem.compute +; GFX1200-NEXT: .LBB13_3: ; %frem.compute15 ; GFX1200-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]| ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5] ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1] @@ -17429,11 +17429,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17 ; GFX1200-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB13_7 -; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader ; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 -; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body +; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body23 ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1200-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10 @@ -17454,7 +17454,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX1200-NEXT: ; %bb.6: ; %Flow51 ; GFX1200-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14 ; GFX1200-NEXT: v_mov_b32_e32 v11, v15 -; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit +; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit24 ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_subrev_nc_u32_e32 v14, 25, v17 ; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14 @@ -17476,7 +17476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_cbranch_vccz .LBB13_10 -; GFX1200-NEXT: ; %bb.9: ; %frem.else16 +; GFX1200-NEXT: ; %bb.9: ; %frem.else ; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]| ; GFX1200-NEXT: v_and_b32_e32 v10, 0x80000000, v3 ; GFX1200-NEXT: s_wait_alu 0xfffd @@ -17487,7 +17487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: s_branch .LBB13_16 ; GFX1200-NEXT: .LBB13_10: ; GFX1200-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1200-NEXT: .LBB13_11: ; %frem.compute15 +; GFX1200-NEXT: .LBB13_11: ; %frem.compute ; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]| ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7] ; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3] @@ -17520,11 +17520,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19 ; GFX1200-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0 ; GFX1200-NEXT: s_cbranch_vccnz .LBB13_15 -; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader ; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3 ; GFX1200-NEXT: s_wait_alu 0xfffe ; GFX1200-NEXT: s_add_co_i32 s2, s2, 26 -; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body23 +; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body ; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1200-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1200-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12 @@ -17547,7 +17547,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-NEXT: ; %bb.14: ; %Flow ; GFX1200-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16 ; GFX1200-NEXT: v_mov_b32_e32 v13, v17 -; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit24 +; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit ; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-NEXT: v_subrev_nc_u32_e32 v16, 25, v19 ; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 3c41cc4..5babe9f 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1111,15 +1111,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,18 +1186,15 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1281,28 +1274,22 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v8i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.l, v1.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 
0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6 -; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[2:3], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v8i8: @@ -1416,44 +1403,34 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v10.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v6.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14 -; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v16i8: @@ -1649,78 +1626,59 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v0.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 
v16.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v6.h, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v11.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v12.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.l, v13.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v16.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v9.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v8.h ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index f67ab18..234eaa8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ 
b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4985,21 +4985,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5243,18 +5239,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off -; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off +; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off +; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 @@ -5528,27 +5520,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 -; 
GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -5994,73 +5980,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 
0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v16.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v5.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off -; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off +; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[9:12], off ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 049663a..f80d50b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2730,18 +2730,15 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l -; 
GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6 -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir new file mode 100644 index 0000000..a4aad57 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- +name: buffer_load_lds_not_valu +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: buffer_load_lds_not_valu + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 0 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec + ; CHECK-NEXT: $m0 = 
S_MOV_B32 1 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec + ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; CHECK-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + $m0 = S_MOV_B32 1 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec + %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec + %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec + %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec + %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec + %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1 ,0 + SCHED_GROUP_BARRIER 2, 4, 0 + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir index 9553fcc..f11fe4a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -59,6 +59,15 @@ body: | ... --- +name: src_shared_base_to_vcc +body: | + bb.0: + ; GFX9-LABEL: name: src_shared_base_to_vcc + ; GFX9: $vcc = S_MOV_B64 $src_shared_base + $vcc = COPY $src_shared_base +... + +--- name: sgpr96_aligned_src_dst body: | bb.0: diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir index c8fee5d..7cbe5de 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir @@ -119,9 +119,10 @@ body: | ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]] %2(s16) = G_CTLZ %1 - ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]] - ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] - ; CHECK: $r0 = COPY [[R]] + ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]] + ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] + ; LIBCALLS: $r0 = COPY [[R]] + ; CLZ: $r0 = COPY [[R32]] %3(s32) = G_SEXT %2(s16) $r0 = COPY %3(s32) BX_RET 14, $noreg, implicit $r0 diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll index 558e2b0..a652241 100644 --- a/llvm/test/CodeGen/ARM/carry.ll +++ b/llvm/test/CodeGen/ARM/carry.ll @@ -1,61 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s define i64 @f1(i64 %a, i64 %b) { ; CHECK-LABEL: f1: -; CHECK: subs r -; CHECK: sbc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbc r1, r1, r3 +; CHECK-NEXT: bx lr entry: - %tmp = sub i64 %a, %b - ret i64 %tmp + %tmp = sub i64 %a, %b + ret i64 %tmp } define i64 @f2(i64 %a, 
i64 %b) { ; CHECK-LABEL: f2: -; CHECK: lsl r -; CHECK: orr r -; CHECK: rsbs r -; CHECK: sbc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: lsl r1, r1, #1 +; CHECK-NEXT: orr r1, r1, r0, lsr #31 +; CHECK-NEXT: rsbs r0, r2, r0, lsl #1 +; CHECK-NEXT: sbc r1, r1, r3 +; CHECK-NEXT: bx lr entry: - %tmp1 = shl i64 %a, 1 - %tmp2 = sub i64 %tmp1, %b - ret i64 %tmp2 + %tmp1 = shl i64 %a, 1 + %tmp2 = sub i64 %tmp1, %b + ret i64 %tmp2 } ; add with live carry define i64 @f3(i32 %al, i32 %bl) { ; CHECK-LABEL: f3: -; CHECK: adds r -; CHECK: adc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: adcs r0, r1, #0 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: bx lr entry: - ; unsigned wide add - %aw = zext i32 %al to i64 - %bw = zext i32 %bl to i64 - %cw = add i64 %aw, %bw - ; ch == carry bit - %ch = lshr i64 %cw, 32 - %dw = add i64 %ch, %bw - ret i64 %dw + ; unsigned wide add + %aw = zext i32 %al to i64 + %bw = zext i32 %bl to i64 + %cw = add i64 %aw, %bw + ; ch == carry bit + %ch = lshr i64 %cw, 32 + %dw = add i64 %ch, %bw + ret i64 %dw } ; rdar://10073745 define i64 @f4(i64 %x) nounwind readnone { -entry: ; CHECK-LABEL: f4: -; CHECK: rsbs r -; CHECK: rsc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rsc r1, r1, #0 +; CHECK-NEXT: bx lr +entry: %0 = sub nsw i64 0, %x ret i64 %0 } ; rdar://12559385 define i64 @f5(i32 %vi) { -entry: ; CHECK-LABEL: f5: -; CHECK: movw [[REG:r[0-9]+]], #36102 -; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]] - %v0 = zext i32 %vi to i64 - %v1 = xor i64 %v0, -155057456198619 - %v4 = add i64 %v1, 155057456198619 - %v5 = add i64 %v4, %v1 - ret i64 %v5 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r1, #19493 +; CHECK-NEXT: movw r2, #29433 +; CHECK-NEXT: movt r1, #57191 +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: movw r3, #46043 +; CHECK-NEXT: movt r2, #65535 +; CHECK-NEXT: adds r0, r0, r0 +; CHECK-NEXT: movw r1, #36102 +; CHECK-NEXT: sbc r2, r2, r1 +; CHECK-NEXT: movt r3, #8344 +; 
CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc r1, r2, r1 +; CHECK-NEXT: bx lr +entry: + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll index bea0310..70224fc 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll @@ -94,6 +94,18 @@ define void @main() #0 { %uav2_2 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0( i32 4, i32 0, i32 10, i32 5, ptr null) + + ; RWBuffer<float4> UnboundedArray[] : register(u10, space5) +; CHECK: - Type: UAVTyped +; CHECK: Space: 5 +; CHECK: LowerBound: 10 +; CHECK: UpperBound: 4294967295 +; CHECK: Kind: TypedBuffer +; CHECK: Flags: +; CHECK: UsedByAtomic64: false + ; RWBuffer<float4> Buf = BufferArray[100]; + %uav3 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 5, i32 10, i32 -1, i32 100, ptr null) ret void } diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir index 1030917..302f70f 100644 --- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir +++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck --strict-whitespace %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll index 1edb387..f345e08 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll +++ 
b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll index 2e80c4c..29b130f 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck 
--check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll index 817b1d5..4e463a14 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | 
%ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1( diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll index cbf647f..fc8cce4 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.fence.before.thread.sync() declare void @llvm.nvvm.tcgen05.fence.after.thread.sync() diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll index a37b1a9..22eb729 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a 
-march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %} ; CHECK-LABEL: nvvm_tcgen05_ld_16x64b define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll index bf2adac..33483b5 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 
-mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %} declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll index 0636a06..ccf6541 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} ; CHECK-LABEL: nvvm_tcgen05_st_16x64b define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) { diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll index 0f7e0c7..1325abf 100644 --- a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll +++ b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll @@ -95,3 
+95,80 @@ declare i4 @llvm.ctpop.i4(i4) #1 !6 = !{!"short", !7, i64 0} !7 = !{!"omnipotent char", !8, i64 0} !8 = !{!"Simple C/C++ TBAA"} + +; Function to lockdown changes for floating point vector comparisons +define range(i32 0, 5) i32 @cols_needed(ptr %colauths){ +; POWERPC_64LE-LABEL: cols_needed: +; POWERPC_64LE: # %bb.0: # %entry +; POWERPC_64LE-NEXT: lxv vs0, 0(r3) +; POWERPC_64LE-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_64LE-NEXT: li r4, 4 +; POWERPC_64LE-NEXT: li r3, 0 +; POWERPC_64LE-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_64LE-NEXT: xxlnor v2, vs0, vs0 +; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2 +; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2 +; POWERPC_64LE-NEXT: rlwinm r4, r4, 1, 30, 30 +; POWERPC_64LE-NEXT: sub r3, r4, r3 +; POWERPC_64LE-NEXT: mfvsrwz r4, v2 +; POWERPC_64LE-NEXT: rlwinm r4, r4, 2, 29, 29 +; POWERPC_64LE-NEXT: or r3, r3, r4 +; POWERPC_64LE-NEXT: li r4, 12 +; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2 +; POWERPC_64LE-NEXT: slwi r4, r4, 3 +; POWERPC_64LE-NEXT: or r3, r3, r4 +; POWERPC_64LE-NEXT: clrlwi r3, r3, 28 +; POWERPC_64LE-NEXT: stb r3, -1(r1) +; POWERPC_64LE-NEXT: lbz r3, -1(r1) +; POWERPC_64LE-NEXT: popcntd r3, r3 +; POWERPC_64LE-NEXT: blr +; +; POWERPC_64-LABEL: cols_needed: +; POWERPC_64: # %bb.0: # %entry +; POWERPC_64-NEXT: lxv vs0, 0(r3) +; POWERPC_64-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_64-NEXT: li r4, 8 +; POWERPC_64-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_64-NEXT: xxlnor v2, vs0, vs0 +; POWERPC_64-NEXT: vextuwlx r4, r4, v2 +; POWERPC_64-NEXT: mfvsrwz r3, v2 +; POWERPC_64-NEXT: rlwinm r4, r4, 1, 30, 30 +; POWERPC_64-NEXT: rlwimi r4, r3, 2, 29, 29 +; POWERPC_64-NEXT: li r3, 0 +; POWERPC_64-NEXT: vextuwlx r3, r3, v2 +; POWERPC_64-NEXT: rlwimi r4, r3, 3, 0, 28 +; POWERPC_64-NEXT: li r3, 12 +; POWERPC_64-NEXT: vextuwlx r3, r3, v2 +; POWERPC_64-NEXT: sub r3, r4, r3 +; POWERPC_64-NEXT: clrlwi r3, r3, 28 +; POWERPC_64-NEXT: stb r3, -1(r1) +; POWERPC_64-NEXT: lbz r3, -1(r1) +; POWERPC_64-NEXT: popcntd r3, r3 +; POWERPC_64-NEXT: blr +; +; 
POWERPC_32-LABEL: cols_needed: +; POWERPC_32: # %bb.0: # %entry +; POWERPC_32-NEXT: lxv vs0, 0(r3) +; POWERPC_32-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_32-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_32-NEXT: xxlnor vs0, vs0, vs0 +; POWERPC_32-NEXT: stxv vs0, -32(r1) +; POWERPC_32-NEXT: lwz r3, -24(r1) +; POWERPC_32-NEXT: lwz r4, -28(r1) +; POWERPC_32-NEXT: rlwinm r3, r3, 1, 30, 30 +; POWERPC_32-NEXT: rlwimi r3, r4, 2, 29, 29 +; POWERPC_32-NEXT: lwz r4, -32(r1) +; POWERPC_32-NEXT: rlwimi r3, r4, 3, 0, 28 +; POWERPC_32-NEXT: lwz r4, -20(r1) +; POWERPC_32-NEXT: sub r3, r3, r4 +; POWERPC_32-NEXT: clrlwi r3, r3, 28 +; POWERPC_32-NEXT: popcntw r3, r3 +; POWERPC_32-NEXT: blr +entry: + %0 = load <4 x float>, ptr %colauths, align 4, !tbaa !5 + %1 = fcmp une <4 x float> %0, zeroinitializer + %2 = bitcast <4 x i1> %1 to i4 + %3 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2) + %4 = zext nneg i4 %3 to i32 + ret i32 %4 +} diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll index 2a46a59..4f036d3 100644 --- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll +++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll @@ -221,8 +221,8 @@ define i64 @test12(i64 %0) #0 { ; ; RV64-LABEL: test12: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a0, a0, -16 -; RV64-NEXT: addi a0, a0, 13 +; RV64-NEXT: addi a0, a0, -16 +; RV64-NEXT: addiw a0, a0, 13 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/i64-icmp.ll b/llvm/test/CodeGen/RISCV/i64-icmp.ll index 88d989d..2742b9a 100644 --- a/llvm/test/CodeGen/RISCV/i64-icmp.ll +++ b/llvm/test/CodeGen/RISCV/i64-icmp.ll @@ -708,8 +708,7 @@ define i64 @icmp_sle_constant_neg_2050(i64 %a) nounwind { define i64 @icmp_eq_zext_inreg_small_constant(i64 %a) nounwind { ; RV64I-LABEL: icmp_eq_zext_inreg_small_constant: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -123 +; RV64I-NEXT: addiw a0, a0, -123 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = and i64 %a, 
4294967295 @@ -748,8 +747,7 @@ define i64 @icmp_ne_zext_inreg_small_constant(i64 %a) nounwind { define i64 @icmp_ne_zext_inreg_large_constant(i64 %a) nounwind { ; RV64I-LABEL: icmp_ne_zext_inreg_large_constant: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, 2 +; RV64I-NEXT: addiw a0, a0, 2 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: ret %1 = and i64 %a, 4294967295 diff --git a/llvm/test/CodeGen/RISCV/min-max.ll b/llvm/test/CodeGen/RISCV/min-max.ll index acde8ad..e7f6899 100644 --- a/llvm/test/CodeGen/RISCV/min-max.ll +++ b/llvm/test/CodeGen/RISCV/min-max.ll @@ -5,6 +5,12 @@ ; RUN: FileCheck %s --check-prefixes=ZBB,RV32ZBB ; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | \ ; RUN: FileCheck %s --check-prefixes=ZBB,RV64ZBB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s | \ +; RUN: FileCheck %s --check-prefixes=XQCI +; RUN: llc < %s -mtriple=riscv32 -mattr=+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB +; RUN: llc < %s -mtriple=riscv64 -mattr=+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB ; Basic tests. 
@@ -23,6 +29,27 @@ define signext i8 @smax_i8(i8 signext %a, i8 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB0_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB0_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB0_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB0_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.smax.i8(i8 %a, i8 %b) ret i8 %c } @@ -42,6 +69,27 @@ define signext i16 @smax_i16(i16 signext %a, i16 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB1_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB1_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB1_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB1_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.smax.i16(i16 %a, i16 %b) ret i16 %c } @@ -61,6 +109,27 @@ define signext i32 @smax_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB2_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB2_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB2_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 
+; RV64I-SFB-NEXT: .LBB2_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 %b) ret i32 %c } @@ -112,6 +181,41 @@ define i64 @smax_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: max a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a2, a0 +; XQCI-NEXT: slt a5, a3, a1 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a2, a0 +; RV32I-SFB-NEXT: slt a5, a3, a1 +; RV32I-SFB-NEXT: bne a1, a3, .LBB3_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB3_2: +; RV32I-SFB-NEXT: bnez a5, .LBB3_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB3_4: +; RV32I-SFB-NEXT: bnez a5, .LBB3_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB3_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB3_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB3_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smax.i64(i64 %a, i64 %b) ret i64 %c } @@ -131,6 +235,27 @@ define signext i8 @smin_i8(i8 signext %a, i8 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a0, a1, .LBB4_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB4_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB4_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB4_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.smin.i8(i8 %a, i8 %b) ret i8 %c } @@ -150,6 +275,27 @@ define signext i16 @smin_i16(i16 
signext %a, i16 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a0, a1, .LBB5_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB5_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB5_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB5_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.smin.i16(i16 %a, i16 %b) ret i16 %c } @@ -169,6 +315,27 @@ define signext i32 @smin_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a0, a1, .LBB6_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB6_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB6_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB6_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 %b) ret i32 %c } @@ -220,6 +387,41 @@ define i64 @smin_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: min a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a0, a2 +; XQCI-NEXT: slt a5, a1, a3 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a0, a2 +; RV32I-SFB-NEXT: slt a5, a1, a3 +; RV32I-SFB-NEXT: bne a1, a3, .LBB7_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB7_2: +; RV32I-SFB-NEXT: bnez 
a5, .LBB7_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB7_4: +; RV32I-SFB-NEXT: bnez a5, .LBB7_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB7_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB7_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB7_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smin.i64(i64 %a, i64 %b) ret i64 %c } @@ -239,6 +441,27 @@ define i8 @umax_i8(i8 zeroext %a, i8 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB8_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB8_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB8_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB8_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.umax.i8(i8 %a, i8 %b) ret i8 %c } @@ -258,6 +481,27 @@ define i16 @umax_i16(i16 zeroext %a, i16 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB9_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB9_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB9_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB9_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.umax.i16(i16 %a, i16 %b) ret i16 %c } @@ -277,6 +521,27 @@ define signext i32 @umax_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; 
ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB10_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB10_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB10_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB10_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 %a, i32 %b) ret i32 %c } @@ -328,6 +593,41 @@ define i64 @umax_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a2, a0 +; XQCI-NEXT: sltu a5, a3, a1 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a2, a0 +; RV32I-SFB-NEXT: sltu a5, a3, a1 +; RV32I-SFB-NEXT: bne a1, a3, .LBB11_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB11_2: +; RV32I-SFB-NEXT: bnez a5, .LBB11_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB11_4: +; RV32I-SFB-NEXT: bnez a5, .LBB11_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB11_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB11_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB11_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.umax.i64(i64 %a, i64 %b) ret i64 %c } @@ -347,6 +647,27 @@ define zeroext i8 @umin_i8(i8 zeroext %a, i8 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; 
XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB12_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB12_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB12_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB12_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.umin.i8(i8 %a, i8 %b) ret i8 %c } @@ -366,6 +687,27 @@ define zeroext i16 @umin_i16(i16 zeroext %a, i16 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB13_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB13_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB13_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB13_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.umin.i16(i16 %a, i16 %b) ret i16 %c } @@ -385,6 +727,27 @@ define signext i32 @umin_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB14_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB14_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB14_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB14_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 %a, i32 %b) ret i32 %c } @@ -436,6 +799,41 @@ define i64 @umin_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; 
RV64ZBB-NEXT: minu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a0, a2 +; XQCI-NEXT: sltu a5, a1, a3 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a0, a2 +; RV32I-SFB-NEXT: sltu a5, a1, a3 +; RV32I-SFB-NEXT: bne a1, a3, .LBB15_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB15_2: +; RV32I-SFB-NEXT: bnez a5, .LBB15_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB15_4: +; RV32I-SFB-NEXT: bnez a5, .LBB15_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB15_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB15_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB15_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.umin.i64(i64 %a, i64 %b) ret i64 %c } @@ -450,6 +848,18 @@ define signext i32 @smin_same_op_i32(i32 signext %a) { ; ZBB-LABEL: smin_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 %a) ret i32 %c } @@ -462,6 +872,18 @@ define signext i32 @smax_same_op_i32(i32 signext %a) { ; ZBB-LABEL: smax_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 %a) ret i32 %c } @@ -474,6 +896,18 @@ define signext i32 
@umin_same_op_i32(i32 signext %a) { ; ZBB-LABEL: umin_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 %a, i32 %a) ret i32 %c } @@ -486,6 +920,18 @@ define signext i32 @umax_same_op_i32(i32 signext %a) { ; ZBB-LABEL: umax_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 %a, i32 %a) ret i32 %c } @@ -510,6 +956,19 @@ define signext i32 @smin_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 undef, i32 undef) ret i32 %c } @@ -532,6 +991,19 @@ define signext i32 @smax_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smax_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 undef, i32 undef) ret i32 %c } @@ -554,6 +1026,19 @@ define signext i32 @umin_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umin_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_undef_i32: +; 
RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 undef, i32 undef) ret i32 %c } @@ -576,6 +1061,19 @@ define signext i32 @umax_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 undef, i32 undef) ret i32 %c } @@ -595,6 +1093,29 @@ define signext i32 @smax_i32_pos_constant(i32 signext %a) { ; ZBB-NEXT: li a1, 10 ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32_pos_constant: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.lilti a0, a0, 11, 10 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32_pos_constant: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a1, 10 +; RV32I-SFB-NEXT: blt a1, a0, .LBB24_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB24_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32_pos_constant: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, 10 +; RV64I-SFB-NEXT: blt a1, a0, .LBB24_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB24_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 10) ret i32 %c } @@ -616,6 +1137,33 @@ define signext i32 @smax_i32_pos_constant_trailing_zeros(i32 signext %a) { ; ZBB-NEXT: li a1, 16 ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32_pos_constant_trailing_zeros: +; XQCI: # %bb.0: +; XQCI-NEXT: andi a1, a0, -8 +; XQCI-NEXT: li a0, 16 +; XQCI-NEXT: qc.mvlt a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: andi a1, a0, -8 +; RV32I-SFB-NEXT: li a0, 16 +; 
RV32I-SFB-NEXT: bge a0, a1, .LBB25_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB25_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: andi a1, a0, -8 +; RV64I-SFB-NEXT: li a0, 16 +; RV64I-SFB-NEXT: bge a0, a1, .LBB25_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB25_2: +; RV64I-SFB-NEXT: ret %b = and i32 %a, -8 %c = call i32 @llvm.smax.i32(i32 %b, i32 16) %d = and i32 %c, -4 @@ -635,6 +1183,29 @@ define signext i32 @smin_i32_negone(i32 signext %a) { ; ZBB-NEXT: li a1, -1 ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i32_negone: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.ligei a0, a0, 0, -1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i32_negone: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a1, -1 +; RV32I-SFB-NEXT: bltz a0, .LBB26_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB26_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i32_negone: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, -1 +; RV64I-SFB-NEXT: bltz a0, .LBB26_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB26_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 -1) ret i32 %c } @@ -672,6 +1243,34 @@ define i64 @smin_i64_negone(i64 %a) { ; RV64ZBB-NEXT: li a1, -1 ; RV64ZBB-NEXT: min a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i64_negone: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.ligei a0, a1, 0, -1 +; XQCI-NEXT: qc.ligei a1, a1, 0, -1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i64_negone: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a2, -1 +; RV32I-SFB-NEXT: bltz a1, .LBB27_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB27_2: +; RV32I-SFB-NEXT: bltz a1, .LBB27_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a1, a2 +; RV32I-SFB-NEXT: .LBB27_4: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i64_negone: +; 
RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, -1 +; RV64I-SFB-NEXT: bltz a0, .LBB27_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB27_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smin.i64(i64 %a, i64 -1) ret i64 %c } @@ -720,6 +1319,41 @@ define i64 @umax_i64_one(i64 %a, i64 %b) { ; RV64ZBB-NEXT: li a1, 1 ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i64_one: +; XQCI: # %bb.0: +; XQCI-NEXT: mv a2, a1 +; XQCI-NEXT: qc.selectinei a2, 0, a0, 1 +; XQCI-NEXT: qc.liltui a0, a0, 2, 1 +; XQCI-NEXT: qc.mvnei a0, a1, 0, a2 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i64_one: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a2, 1 +; RV32I-SFB-NEXT: li a3, 1 +; RV32I-SFB-NEXT: beqz a1, .LBB28_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a3, a0 +; RV32I-SFB-NEXT: .LBB28_2: +; RV32I-SFB-NEXT: bnez a0, .LBB28_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB28_4: +; RV32I-SFB-NEXT: beqz a1, .LBB28_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a0, a3 +; RV32I-SFB-NEXT: .LBB28_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i64_one: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, 1 +; RV64I-SFB-NEXT: bnez a0, .LBB28_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB28_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.umax.i64(i64 %a, i64 1) ret i64 %c } diff --git a/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll new file mode 100644 index 0000000..c19e93d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O1 -mtriple=riscv64 -mattr=+v < %s | FileCheck %s + +define i32 @pr134424(i64 %input_value, i32 %base_value, i1 %cond_flag1, i1 %cond_flag2, i1 %cond_flag3) { +; CHECK-LABEL: pr134424: +; 
CHECK: # %bb.0: # %for.body.us.preheader.i +; CHECK-NEXT: andi a3, a3, 1 +; CHECK-NEXT: andi a5, a2, 1 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 14 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: bnez a5, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %for.body.us.preheader.i +; CHECK-NEXT: li a2, 1 +; CHECK-NEXT: .LBB0_2: # %for.body.us.preheader.i +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: andi a4, a4, 1 +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: bnez a3, .LBB0_4 +; CHECK-NEXT: # %bb.3: # %for.body.us.preheader.i +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: .LBB0_4: # %for.body.us.preheader.i +; CHECK-NEXT: vmsle.vi v0, v8, 0 +; CHECK-NEXT: sext.w a2, a2 +; CHECK-NEXT: bnez a4, .LBB0_6 +; CHECK-NEXT: # %bb.5: # %for.body.us.preheader.i +; CHECK-NEXT: li a1, 1 +; CHECK-NEXT: .LBB0_6: # %for.body.us.preheader.i +; CHECK-NEXT: sext.w a0, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vredmin.vs v8, v8, v8 +; CHECK-NEXT: vmv.x.s a3, v8 +; CHECK-NEXT: sext.w a1, a1 +; CHECK-NEXT: bge a3, a2, .LBB0_11 +; CHECK-NEXT: # %bb.7: # %for.body.us.preheader.i +; CHECK-NEXT: bge a0, a1, .LBB0_12 +; CHECK-NEXT: .LBB0_8: # %for.body.us.preheader.i +; CHECK-NEXT: blt a3, a0, .LBB0_10 +; CHECK-NEXT: .LBB0_9: # %for.body.us.preheader.i +; CHECK-NEXT: mv a3, a0 +; CHECK-NEXT: .LBB0_10: # %for.body.us.preheader.i +; CHECK-NEXT: sw a3, 0(zero) +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_11: # %for.body.us.preheader.i +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: blt a0, a1, .LBB0_8 +; CHECK-NEXT: .LBB0_12: # %for.body.us.preheader.i +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: bge a3, a0, .LBB0_9 +; CHECK-NEXT: j 
.LBB0_10 +for.body.us.preheader.i: + %partial_vector = insertelement <4 x i64> zeroinitializer, i64 %input_value, i64 1 + %comparison_vector = shufflevector <4 x i64> %partial_vector, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 1, i32 1> + %comparison_result = icmp sle <4 x i64> %comparison_vector, zeroinitializer + %selected_value1 = select i1 %cond_flag1, i32 %base_value, i32 1 + %selected_value2 = select i1 %cond_flag2, i32 %base_value, i32 1 + %selected_value3 = select i1 %cond_flag3, i32 %base_value, i32 1 + %bool_to_int = zext <4 x i1> %comparison_result to <4 x i32> + %extended_vector = shufflevector <4 x i32> %bool_to_int, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> + %vector_min = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %extended_vector) + %min1 = call i32 @llvm.smin.i32(i32 %vector_min, i32 %selected_value1) + %min2 = call i32 @llvm.smin.i32(i32 %selected_value2, i32 %selected_value3) + %final_min = call i32 @llvm.smin.i32(i32 %min1, i32 %min2) + store i32 %final_min, ptr null, align 4 + ret i32 0 +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir new file mode 100644 index 0000000..aeab8f6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/regcoal-liveinterval-pruning-crash.mir @@ -0,0 +1,57 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=register-coalescer -o - %s | FileCheck %s + +--- +name: pr71023 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: pr71023 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x10, $v8, $v10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:gpr = IMPLICIT_DEF + ; CHECK-NEXT: undef [[PseudoVMV_V_I_M1_:%[0-9]+]].sub_vrm1_2:vrn8m1 = PseudoVMV_V_I_M1 undef 
[[PseudoVMV_V_I_M1_]].sub_vrm1_2, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: [[PseudoVMV_V_I_M1_:%[0-9]+]].sub_vrm1_6:vrn8m1 = COPY undef [[PseudoVMV_V_I_M1_]].sub_vrm1_2 + ; CHECK-NEXT: BNE undef [[DEF]], $x0, %bb.3 + ; CHECK-NEXT: PseudoBR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: BNE undef [[DEF]], $x0, %bb.3 + ; CHECK-NEXT: PseudoBR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber [[PseudoVMV_V_I_M1_]].sub_vrm1_0:vrn8m1 = PseudoVRGATHER_VI_M1 undef [[PseudoVMV_V_I_M1_]].sub_vrm1_0, [[PseudoVMV_V_I_M1_]].sub_vrm1_2, 0, 0, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoVSSEG6E8_V_M1_MASK [[PseudoVMV_V_I_M1_]].sub_vrm1_0_sub_vrm1_1_sub_vrm1_2_sub_vrm1_3_sub_vrm1_4_sub_vrm1_5, undef [[DEF]], killed undef $v0, 0, 3 /* e8 */, implicit $vl, implicit $vtype :: (store unknown-size, align 1) + ; CHECK-NEXT: PseudoRET + bb.0: + successors: %bb.3(0x40000000), %bb.1(0x40000000) + liveins: $x10, $v8, $v10 + %0:gpr = IMPLICIT_DEF + %1:vrnov0 = PseudoVMV_V_I_M1 undef %1, 0, -1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + %2:vrnov0 = IMPLICIT_DEF + undef %3.sub_vrm1_0:vrn6m1nov0 = COPY undef %1 + %3.sub_vrm1_3:vrn6m1nov0 = COPY %2 + %3.sub_vrm1_4:vrn6m1nov0 = COPY undef %1 + BNE undef %0, $x0, %bb.3 + PseudoBR %bb.1 + bb.1: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + BNE killed undef %0, $x0, %bb.3 + PseudoBR %bb.2 + bb.2: + successors: %bb.3(0x80000000) + bb.3: + %4:vr = IMPLICIT_DEF + early-clobber %4:vr = PseudoVRGATHER_VI_M1 undef %4, killed %1, 0, 0, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + undef %5.sub_vrm1_0:vrn6m1 = COPY killed %4 + %5.sub_vrm1_5:vrn6m1 = COPY killed %2 
+ PseudoVSSEG6E8_V_M1_MASK killed %5, undef %0, killed undef $v0, 0, 3 /* e8 */, implicit $vl, implicit $vtype :: (store unknown-size, align 1) + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll new file mode 100644 index 0000000..d9a49a1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v -O0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 + +define void @matmul() { +; CHECK-RV64-LABEL: matmul: +; CHECK-RV64: # %bb.0: # %entry +; CHECK-RV64-NEXT: li a0, 0 +; CHECK-RV64-NEXT: vsetvli zero, a0, 512 +; CHECK-RV64-NEXT: sf.vsettm zero, a0 +; CHECK-RV64-NEXT: sf.vtzero.t mt0 +; CHECK-RV64-NEXT: ret +entry: + call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 0, i64 0, i64 3, i64 1) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.riscv.sf.vtzero.t.i64(i64 immarg, i64, i64, i64 immarg, i64 immarg) #0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir new file mode 100644 index 0000000..389283a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir @@ -0,0 +1,523 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \ +# RUN: -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s + +--- | + define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, 
<vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + ret void + } + + define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4) + ret void + } + + define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2) + ret void + } + + define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) { + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl) + %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl) + call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl) + ret <vscale x 64 x i8> %1 + } + + define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) { + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl) + %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl) + %2 = call <vscale x 64 x i8> 
@llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl) + call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl) + ret <vscale x 64 x i8> %2 + } + + define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2) + call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn) + ret void + } + + define i64 @vsettnt_max(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2) + %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettm(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettn(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettk(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define void @sf_vtzero(i64 %tm, i64 %tn) { + entry: + call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4) + ret void + } + + declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64) + declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64) + declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64) + declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64) + declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64) + declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64) + declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64) + declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64) + declare i64 
@llvm.riscv.sf.vsettk.i64(i64, i64, i64) + declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64) +... +--- +name: xsfmm_same_state +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v16m8', virtual-reg: '%1' } # second vrm8 input arrives in $v16m8, matching the block live-ins and the COPY into %1 + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_same_state + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, 
%1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoRET +... +--- +name: xsfmm_different_state +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v16m8', virtual-reg: '%1' } # second vrm8 input arrives in $v16m8, matching the block live-ins and the COPY into %1 + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_different_state + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, 
implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm + PseudoRET +... +--- +name: xsfmm_different_state_bf +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v16m8', virtual-reg: '%1' } # second vrm8 input arrives in $v16m8, matching the block live-ins and the COPY into %1 + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_different_state_bf + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: 
dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoRET +... 
+--- +name: interleave_rvv_and_xsfmm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: vrm8 } + - { id: 5, class: vrm8 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11 + ; CHECK-LABEL: name: interleave_rvv_and_xsfmm + ; CHECK: liveins: $v8m8, $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %2:gpr = COPY $x11 + %1:gprnox0 = COPY $x10 + %0:vrm8 = COPY $v8m8 + %3:gpr = ADDI $x0, 1 + %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1 ; XSfmm op: expected output puts PseudoSF_VSETTNT 512 (e8, w1) ahead of it and replaces the %1 operand with $noreg + %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0 ; RVV op in between: expected output switches to PseudoVSETVLI 195 (e8, m8, ta, ma) + PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1 ; XSfmm op again: expected output switches back with PseudoSF_VSETTNT 520 (e16, w1) + $v8m8 = COPY %5:vrm8 + PseudoRET implicit $v8m8 +... 
+--- +name: interleave_rvv_and_xsfmm2 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: vrm8 } + - { id: 5, class: vrm8 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11 + ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2 + ; CHECK: liveins: $v8m8, $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1 + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %2:gpr = COPY $x11 + %1:gprnox0 = COPY $x10 + 
%0:vrm8 = COPY $v8m8 + %3:gpr = ADDI $x0, 1 + %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0 + %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1 + %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0 + PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1 + $v8m8 = COPY %6:vrm8 + PseudoRET implicit $v8m8 +... +--- +name: consecutive_xsfmm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } + - { reg: '$x12', virtual-reg: '%3' } + - { reg: '$x13', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11, $x12, $x13 + ; CHECK-LABEL: name: consecutive_xsfmm + ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %0:vrm8 = COPY 
$v8m8 + %1:gprnox0 = COPY $x10 + %2:gprnox0 = COPY $x11 + %3:gprnox0 = COPY $x12 + %4:gprnox0 = COPY $x13 + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm + PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1 + PseudoRET +... +--- +name: vsettnt_max +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: vsettnt_max + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype + %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + $x10 = COPY %3:gprnox0 + PseudoRET implicit $x10 +... 
+--- +name: single_vsettm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettm + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype ; lone VSETTM: expected output places a PseudoSF_VSETTNTX0 (520: e16, w1) ahead of it and this instruction gains a second implicit $vtype + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... +--- +name: single_vsettn +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettn + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype ; lone VSETTNT: expected output keeps it as is, nothing is placed ahead of it + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... 
+--- +name: single_vsettk +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettk + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype ; lone VSETTK: expected output places a PseudoSF_VSETTNTX0 (520: e16, w1) ahead of it and this instruction gains a second implicit $vtype + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... +--- +name: sf_vtzero +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } + - { id: 1, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } + - { reg: '$x11', virtual-reg: '%1' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10, $x11 + ; CHECK-LABEL: name: sf_vtzero + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %0:gprnox0 = COPY $x10 + %1:gprnox0 = COPY $x11 + PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4 ; expected output: PseudoSF_VSETTNT on %1 (1536: e8, w4) and PseudoSF_VSETTM on %0 come first, no PseudoSF_VSETTK, and both register operands become $noreg + PseudoRET +... 
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll new file mode 100644 index 0000000..9b9a849 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e4m3.e4m3 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll new file mode 100644 index 0000000..b63974f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void 
@llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e4m3.e5m2 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll new file mode 100644 index 0000000..62d629b1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) ; the test function below is named after this intrinsic (e5m2 by e4m3) + +define void @test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e5m2.e4m3 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, 
iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll new file mode 100644 index 0000000..7a90c97 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) ; the test function below is named after this intrinsic (e5m2 by e5m2) + +define void @test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e5m2.e5m2 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll new file mode 100644 index 0000000..29451c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \ 
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen, <vscale x 32 x half>, <vscale x 32 x half>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w2_f16m8(iXLen %mtd, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w2_f16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w2 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen 0, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 2) + ret void +} + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen, <vscale x 16 x float>, <vscale x 16 x float>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w1_f32m8(iXLen %mtd, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w1_f32m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen 0, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1) + ret void +} + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen, <vscale x 8 x double>, <vscale x 8 x double>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w1_f64m8(iXLen %mtd, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w1_f64m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void 
@llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen 0, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll new file mode 100644 index 0000000..6a4b29f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_s_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_s_s_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.s.s mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll new file mode 100644 index 0000000..79239b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc 
-mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_s_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_s_u_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.s.u mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll new file mode 100644 index 0000000..b0d039b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_u_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_u_s_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.u.s mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void 
@llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll new file mode 100644 index 0000000..913c277 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_u_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_u_u_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.u.u mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll new file mode 100644 index 0000000..8048dec --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f 
-mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte16.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte16(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1 +; CHECK-NEXT: sf.vlte16 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte16.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll new file mode 100644 index 0000000..a526dc8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte32.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte32(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: 
test_sf_vlte32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vlte32 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte32.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll new file mode 100644 index 0000000..ed0c48a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte64.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte64(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vlte64 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte64.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll new file mode 100644 index 0000000..67b3ed2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc 
-mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte8.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte8(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1 +; CHECK-NEXT: sf.vlte8 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte8.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll new file mode 100644 index 0000000..4da37fa --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen 
@llvm.riscv.sf.vsettk.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettk(iXLen %tk) { +; CHECK-LABEL: test_sf_vsettk: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a1, zero, e16, w2 +; CHECK-NEXT: sf.vsettk a0, a0 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettk.iXLen(iXLen %tk, iXLen 1, iXLen 2) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll new file mode 100644 index 0000000..143c26c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettm(iXLen %tm) { +; CHECK-LABEL: test_sf_vsettm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a1, zero, e8, w4 +; CHECK-NEXT: sf.vsettm a0, a0 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen %tm, iXLen 0, iXLen 3) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll new file mode 100644 index 0000000..48fa1bc8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettnt_e8w1(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w1 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 1) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e8w2(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w2 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 2) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e8w4(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w4 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 3) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w1(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e16, w1 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 1) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w2(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w2: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: sf.vsettnt a0, a0, e16, w2 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 2) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w4(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e16, w4 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 3) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll new file mode 100644 index 0000000..7a76151 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste16.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste16(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1 +; CHECK-NEXT: sf.vste16 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste16.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll new file mode 100644 index 0000000..8ff6e6a --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste32.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste32(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vste32 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste32.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll new file mode 100644 index 0000000..53990e4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: 
-mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste64.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste64(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vste64 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste64.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll new file mode 100644 index 0000000..09b7259 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste8.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste8(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1 +; CHECK-NEXT: sf.vste8 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste8.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll new file mode 100644 index 0000000..394eb60 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtdiscard() + +define dso_local void @test_sf_vtdiscard() { +; CHECK-LABEL: test_sf_vtdiscard: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vtdiscard +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtdiscard() + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll new file mode 100644 index 0000000..66c9d26 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: 
-mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen, <vscale x 32 x bfloat>, iXLen) + +define void @test_sf_vtmv_t_v_bf16m8(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_bf16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen, <vscale x 32 x half>, iXLen) + +define void @test_sf_vtmv_t_v_f16(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen, <vscale x 16 x float>, iXLen) + +define void @test_sf_vtmv_t_v_f32(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen, <vscale x 8 x double>, iXLen) + +define void @test_sf_vtmv_t_v_f64(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void 
@llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen, <vscale x 64 x i8>, iXLen) + +define void @test_sf_vtmv_t_v_i8(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen, <vscale x 32 x i16>, iXLen) + +define void @test_sf_vtmv_t_v_i16(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen, <vscale x 16 x i32>, iXLen) + +define void @test_sf_vtmv_t_v_i32(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen, <vscale x 8 x i64>, iXLen) + +define void @test_sf_vtmv_t_v_i64(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl) + ret void +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll new file mode 100644 index 0000000..0dcc2ab --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen, iXLen) + +define <vscale x 32 x bfloat> @test_sf_vtmv_v_t_bf16m8(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_bf16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x bfloat> %0 +} + +declare <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen, iXLen) + +define <vscale x 32 x half> @test_sf_vtmv_v_t_f16(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x half> %0 +} + +declare <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen, iXLen) + +define <vscale x 16 x float> 
@test_sf_vtmv_v_t_f32(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 16 x float> %0 +} + +declare <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen, iXLen) + +define <vscale x 8 x double> @test_sf_vtmv_v_t_f64(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 8 x double> %0 +} + +declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen, iXLen) + +define <vscale x 64 x i8> @test_sf_vtmv_v_t_i8(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 64 x i8> %0 +} + +declare <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen, iXLen) + +define <vscale x 32 x i16> @test_sf_vtmv_v_t_i16(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x i16> %0 +} + +declare <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen, iXLen) + +define <vscale x 16 x i32> @test_sf_vtmv_v_t_i32(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; 
CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 16 x i32> %0 +} + +declare <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen, iXLen) + +define <vscale x 8 x i64> @test_sf_vtmv_v_t_i64(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 8 x i64> %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll new file mode 100644 index 0000000..bbccb02 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtzero.t.iXLen(iXLen, iXLen, iXLen, iXLen, iXLen) +define void @test_sf_vtzero_t(iXLen %tm, iXLen %tn) { +; CHECK-LABEL: test_sf_vtzero_t: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a0 +; CHECK-NEXT: sf.vtzero.t mt0 +; CHECK-NEXT: ret + entry: + call void 
@llvm.riscv.sf.vtzero.t.iXLen(iXLen 0, iXLen %tm, iXLen %tn, iXLen 3, iXLen 4) + ret void +} + diff --git a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll index 2f03ff9..318268a 100644 --- a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll +++ b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll @@ -15,8 +15,7 @@ define i32 @from_cmpeq(i32 %xx, i32 %y) { ; ; RV64I-LABEL: from_cmpeq: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -9 +; RV64I-NEXT: addiw a0, a0, -9 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret @@ -39,8 +38,7 @@ define i32 @from_cmpeq_fail_bad_andmask(i32 %xx, i32 %y) { ; ; RV64I-LABEL: from_cmpeq_fail_bad_andmask: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -9 +; RV64I-NEXT: addiw a0, a0, -9 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a0, a0, -1 ; RV64I-NEXT: and a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/setcc-logic.ll b/llvm/test/CodeGen/RISCV/setcc-logic.ll index fabb573..4e14893 100644 --- a/llvm/test/CodeGen/RISCV/setcc-logic.ll +++ b/llvm/test/CodeGen/RISCV/setcc-logic.ll @@ -104,9 +104,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind { ; ; RV64I-LABEL: and_icmps_const_not1bit_diff: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a1, a0, -44 -; RV64I-NEXT: addi a0, a0, -92 +; RV64I-NEXT: addiw a1, a0, -44 +; RV64I-NEXT: addiw a0, a0, -92 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: and a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index bdbe4ed..07bfbe6 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -674,8 +674,7 @@ define i32 @sext_of_not_cmp_i32(i32 %x) { ; ; RV64-LABEL: sext_of_not_cmp_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: addi a0, a0, -7 +; RV64-NEXT: addiw a0, a0, -7 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: addi 
a0, a0, -1 ; RV64-NEXT: ret @@ -718,8 +717,7 @@ define i32 @dec_of_zexted_cmp_i32(i32 %x) { ; ; RV64-LABEL: dec_of_zexted_cmp_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: addi a0, a0, -7 +; RV64-NEXT: addiw a0, a0, -7 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 2751332c..bf6802d 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1047,8 +1047,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64-LABEL: usubo.i32.constant.lhs: ; RV64: # %bb.0: # %entry ; RV64-NEXT: li a2, -2 -; RV64-NEXT: subw a2, a2, a0 -; RV64-NEXT: addi a0, a2, 1 +; RV64-NEXT: sub a2, a2, a0 +; RV64-NEXT: addiw a0, a2, 1 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret @@ -1065,8 +1065,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64ZBA-LABEL: usubo.i32.constant.lhs: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: li a2, -2 -; RV64ZBA-NEXT: subw a2, a2, a0 -; RV64ZBA-NEXT: addi a0, a2, 1 +; RV64ZBA-NEXT: sub a2, a2, a0 +; RV64ZBA-NEXT: addiw a0, a2, 1 ; RV64ZBA-NEXT: seqz a0, a0 ; RV64ZBA-NEXT: sw a2, 0(a1) ; RV64ZBA-NEXT: ret @@ -1083,8 +1083,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64ZICOND-LABEL: usubo.i32.constant.lhs: ; RV64ZICOND: # %bb.0: # %entry ; RV64ZICOND-NEXT: li a2, -2 -; RV64ZICOND-NEXT: subw a2, a2, a0 -; RV64ZICOND-NEXT: addi a0, a2, 1 +; RV64ZICOND-NEXT: sub a2, a2, a0 +; RV64ZICOND-NEXT: addiw a0, a2, 1 ; RV64ZICOND-NEXT: seqz a0, a0 ; RV64ZICOND-NEXT: sw a2, 0(a1) ; RV64ZICOND-NEXT: ret diff --git a/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll new file mode 100644 index 0000000..ddc2585 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll @@ -0,0 +1,19 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | 
FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; Verify that llvm.compiler.used is not lowered. +; CHECK: OpName %{{[0-9]+}} "unused" +; CHECK-NOT: OpName %{{[0-9]+}} "llvm.compiler.used" + +; Check that the type of llvm.compiler.used is not emitted too. +; CHECK-NOT: OpTypeArray + +@unused = private addrspace(3) global i32 0 +@llvm.compiler.used = appending addrspace(2) global [1 x ptr addrspace (4)] [ptr addrspace(4) addrspacecast (ptr addrspace(3) @unused to ptr addrspace(4))] + +define spir_func void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll index c6ee804..07fbed9 100644 --- a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll @@ -90,7 +90,7 @@ define i32 @test_tbegin_nofloat4(i32 %pad, ptr %ptr) { ; CHECK: tbegin 0, 65292 ; CHECK: ipm %r2 ; CHECK: srl %r2, 28 -; CHECK: ciblh %r2, 2, 0(%r14) +; CHECK: bnhr %r14 ; CHECK: mvhi 0(%r3), 0 ; CHECK: br %r14 %res = call i32 @llvm.s390.tbegin.nofloat(ptr null, i32 65292) @@ -219,7 +219,7 @@ define i32 @test_tend2(i32 %pad, ptr %ptr) { ; CHECK: tend ; CHECK: ipm %r2 ; CHECK: srl %r2, 28 -; CHECK: ciblh %r2, 2, 0(%r14) +; CHECK: bnhr %r14 ; CHECK: mvhi 0(%r3), 0 ; CHECK: br %r14 %res = call i32 @llvm.s390.tend() diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll new file mode 100644 index 0000000..6b8746e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll @@ -0,0 +1,738 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | 
FileCheck %s +; Test implementation of combining br_ccmask for flag output operand, and +; optimizing ipm sequence using conditional branches. + +declare void @dummy() + +; Check a case where the cc is used as an integer. +; Just (srl (ipm)) sequence without optimization. +define i32 @test(ptr %a) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ipm %r2 +; CHECK-NEXT: srl %r2, 28 +; CHECK-NEXT: br %r14 + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + ret i32 %cc +} + +; Test-1(f1_0_*). Test all 14 valid combinations, where cc is being used for +; branching. + +; Check (cc == 0). +define void @f1_0_eq_0(ptr %a) { +; CHECK-LABEL: f1_0_eq_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jge dummy@PLT +; CHECK-NEXT: .LBB1_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 0). 
+define void @f1_0_ne_0(ptr %a) { +; CHECK-LABEL: f1_0_ne_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgne dummy@PLT +; CHECK-NEXT: .LBB2_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1). +define void @f1_0_eq_1(ptr %a) { +; CHECK-LABEL: f1_0_eq_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgl dummy@PLT +; CHECK-NEXT: .LBB3_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 1). +define void @f1_0_ne_1(ptr %a) { +; CHECK-LABEL: f1_0_ne_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB4_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ne i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 2). 
+define void @f1_0_eq_2(ptr %a) { +; CHECK-LABEL: f1_0_eq_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgh dummy@PLT +; CHECK-NEXT: .LBB5_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 2). +define void @f1_0_ne_2(ptr %a) { +; CHECK-LABEL: f1_0_ne_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB6_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ne i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 3). +define void @f1_0_eq_3(ptr %a) { +; CHECK-LABEL: f1_0_eq_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgo dummy@PLT +; CHECK-NEXT: .LBB7_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 3). 
+define void @f1_0_ne_3(ptr %a) { +; CHECK-LABEL: f1_0_ne_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgno dummy@PLT +; CHECK-NEXT: .LBB8_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ult i32 %cc, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|1). +define void @f1_0_01(ptr %a) { +; CHECK-LABEL: f1_0_01: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB9_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ult i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|2). +define void @f1_0_02(ptr %a) { +; CHECK-LABEL: f1_0_02: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jghe dummy@PLT +; CHECK-NEXT: .LBB10_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|3). 
+define void @f1_0_03(ptr %a) { +; CHECK-LABEL: f1_0_03: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnlh dummy@PLT +; CHECK-NEXT: .LBB11_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp0 = icmp ne i32 %cc, 0 + %cmp3 = icmp ne i32 %cc, 3 + %cmp.inv = and i1 %cmp0, %cmp3 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1|2). +define void @f1_0_12(ptr %a) { +; CHECK-LABEL: f1_0_12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB12_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq1 = icmp eq i32 %cc, 1 + %cmpeq2 = icmp eq i32 %cc, 2 + %cmp = or i1 %cmpeq1, %cmpeq2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1|3). 
+define void @f1_0_13(ptr %a) { +; CHECK-LABEL: f1_0_13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnhe dummy@PLT +; CHECK-NEXT: .LBB13_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq1 = icmp eq i32 %cc, 1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp = or i1 %cmpeq1, %cmpeq3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 2|3). +define void @f1_0_23(ptr %a) { +; CHECK-LABEL: f1_0_23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnle dummy@PLT +; CHECK-NEXT: .LBB14_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ugt i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Test-2(f1_1_*/f1_2_*/fl_3_*/f1_4_*). +; Test Mixed patterns involving Binary Ops. + +; Check 'add' for (cc != 0). 
+define void @f1_1_1(ptr %a) { +; CHECK-LABEL: f1_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgne dummy@PLT +; CHECK-NEXT: .LBB15_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cmp = icmp ult i32 %add, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'add' for (cc == 1|2). +define void @f1_1_2(ptr %a) { +; CHECK-LABEL: f1_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB16_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cmp = icmp ult i32 %add, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'add' for (cc == 1|2). 
+define void @f1_1_3(ptr %a) { +; CHECK-LABEL: f1_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB17_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cmp.inv = icmp ult i32 %add, -2 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define void @f1_2_1(ptr %a) { +; CHECK-LABEL: f1_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB18_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp.inv = and i1 %cmpne3, %cmpne0 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and' with both operands select_ccmask(cc != 2). 
+define void @f1_2_2(ptr %a) { +; CHECK-LABEL: f1_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB19_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %and.cond.inv = and i1 %ugt1, %cmpne3 + br i1 %and.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and/tm' for (cc == 0|2). +define void @f1_2_3(ptr %a) { +; CHECK-LABEL: f1_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jghe dummy@PLT +; CHECK-NEXT: .LBB20_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and/tm' for (cc == 1|3). 
+define void @f1_2_4(ptr %a) { +; CHECK-LABEL: f1_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnhe dummy@PLT +; CHECK-NEXT: .LBB21_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define void @f1_2_5(ptr %a) { +; CHECK-LABEL: f1_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB22_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp = xor i1 %cmpne3, %trunc + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define void @f1_3_1(ptr %a) { +; CHECK-LABEL: f1_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB23_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp.inv = xor i1 %cmpne3, %xor + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). +define void @f1_3_2(ptr %a) { +; CHECK-LABEL: f1_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB24_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp.inv = xor i1 %cmpeq3, %trunc + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). 
+define void @f1_3_3(ptr %a) { +; CHECK-LABEL: f1_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB25_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cmp.cond.inv = xor i1 %cmpne0, %trunc + br i1 %cmp.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' with both operands are select_ccmask one with TM and other with +; ICMP(cc == 1). +define void @f1_4_1(ptr %a) { +; CHECK-LABEL: f1_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgl dummy@PLT +; CHECK-NEXT: .LBB26_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp.cond.inv = or i1 %cmpeq3, %cmpeq0 + br i1 %cmp.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' for (cc == 0|1). 
+define void @f1_4_2(ptr %a) { +; CHECK-LABEL: f1_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB27_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cmp.inv = icmp samesign ugt i32 %or, -3 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' for (cc == 0|1). +define void @f1_4_3(ptr %a) { +; CHECK-LABEL: f1_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB28_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cmp = icmp samesign ult i32 %or, -2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll new file mode 100644 index 0000000..b9b9a4b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll @@ -0,0 +1,1665 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | FileCheck %s +; Test implementation of combining select_ccmask for flag output operand and +; optimizing ipm sequence using conditional branches. 
+ +; Test-1(f2_0_*): Both TrueVal and FalseVal non-const(14-valid CCMask). + +; Check (cc == 0). +define i64 @f2_0_eq_0(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ber %r14 +; CHECK-NEXT: .LBB0_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 0). +define i64 @f2_0_ne_0(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB1_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ugt i32 %cc, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 1). +define i64 @f2_0_eq_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB2_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 1). 
+define i64 @f2_0_ne_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB3_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ne i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 2). +define i64 @f2_0_eq_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB4_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 2). +define i64 @f2_0_ne_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnhr %r14 +; CHECK-NEXT: .LBB5_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ne i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 3). 
+define i64 @f2_0_eq_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bor %r14 +; CHECK-NEXT: .LBB6_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 3). +define i64 @f2_0_ne_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnor %r14 +; CHECK-NEXT: .LBB7_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ult i32 %cc, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|1). +define i64 @f2_0_01(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_01: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB8_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ult i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|2). 
+define i64 @f2_0_02(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_02: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB9_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|3). +define i64 @f2_0_03(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_03: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB10_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cmp0 = icmp ne i32 %cc, 0 + %cmp3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmp0, %cmp3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 1|2). +define i64 @f2_0_12(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB11_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 1|3). 
+define i64 @f2_0_13(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB12_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 2|3). +define i64 @f2_0_23(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB13_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ugt i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Test-2(f2_1_*/f2_2_*/f2_3_*/f2_4_*). +; Both TrueVal and FalseVal are non-const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). 
+define i64 @f2_1_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB14_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f2_1_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB15_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). 
+define i64 @f2_1_3(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB16_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and' with one operand cc and the other select_ccmask(cc != 1). +define i64 @f2_2_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB17_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f2_2_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB18_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f2_2_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB19_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). 
+define i64 @f2_2_4(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB20_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f2_2_5(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB21_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define i64 @f2_3_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB22_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 1). +define i64 @f2_3_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB23_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 2).
+define i64 @f2_3_3(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB24_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f2_4_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB25_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f2_4_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB26_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f2_4_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB27_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Test-3(f3_1_*/f3_2_*/f3_3_*/f3_4_*). +; TrueVal is non-const and FalseVal is const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). 
+define i64 @f3_1_1(i64 %x, ptr %a) { +; CHECK-LABEL: f3_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB28_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f3_1_2(i64 %x, ptr %a) { +; CHECK-LABEL: f3_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB29_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). 
+define i64 @f3_1_3(ptr %a, i64 %x) { +; CHECK-LABEL: f3_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB30_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and' with one operand cc and the other select_ccmask(cc != 1). +define i64 @f3_2_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB31_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f3_2_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB32_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f3_2_3(i64 %x, ptr %a) { +; CHECK-LABEL: f3_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB33_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). 
+define i64 @f3_2_4(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB34_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f3_2_5(i64 %x, ptr %a) { +; CHECK-LABEL: f3_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB35_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define i64 @f3_3_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB36_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 1). +define i64 @f3_3_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB37_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 2).
+define i64 @f3_3_3(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB38_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f3_4_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB39_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f3_4_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB40_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f3_4_3(i64 %x, ptr %a) { +; CHECK-LABEL: f3_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB41_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + + +; Test-4(f4_1_*/f4_2_*/f4_3_*/f4_4_*). +; TrueVal is const and FalseVal is non-const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). 
+define i64 @f4_1_1(ptr %a, i64 %y) { +; CHECK-LABEL: f4_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB42_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f4_1_2(ptr %a, i64 %y) { +; CHECK-LABEL: f4_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB43_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). 
+define i64 @f4_1_3(i64 %y, ptr %a) { +; CHECK-LABEL: f4_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB44_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and' with one operand cc and the other select_ccmask(cc != 1). +define i64 @f4_2_1(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB45_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2).
+define i64 @f4_2_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB46_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f4_2_3(ptr %a, i64 %y) { +; CHECK-LABEL: f4_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB47_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). 
+define i64 @f4_2_4(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB48_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f4_2_5(ptr %a, i64 %y) { +; CHECK-LABEL: f4_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB49_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define i64 @f4_3_1(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB50_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 1). +define i64 @f4_3_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB51_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc != 2).
+define i64 @f4_3_3(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB52_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f4_4_1(i64 %y,ptr %a) { +; CHECK-LABEL: f4_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB53_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f4_4_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB54_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f4_4_3(ptr %a, i64 %y) { +; CHECK-LABEL: f4_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB55_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Test-5(f5_1_*/f5_2_*/f5_3_*/f5_4_*). +; Both TrueVal and FalseVal are const with mixed patterns involving +; Binary Ops. + + +; Check 'add' for (cc != 0). 
+define i64 @f5_1_1(ptr %a) { +; CHECK-LABEL: f5_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB56_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f5_1_2(ptr %a) { +; CHECK-LABEL: f5_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB57_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). 
+define i64 @f5_1_3(ptr %a) { +; CHECK-LABEL: f5_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB58_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define i64 @f5_2_1(ptr %a) { +; CHECK-LABEL: f5_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB59_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2). 
+define i64 @f5_2_2(ptr %a) { +; CHECK-LABEL: f5_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB60_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f5_2_3(ptr %a) { +; CHECK-LABEL: f5_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB61_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). 
+define i64 @f5_2_4(ptr %a) { +; CHECK-LABEL: f5_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB62_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f5_2_5(ptr %a) { +; CHECK-LABEL: f5_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB63_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define i64 @f5_3_1(ptr %a) { +; CHECK-LABEL: f5_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB64_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). +define i64 @f5_3_2(ptr %a) { +; CHECK-LABEL: f5_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB65_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). 
+define i64 @f5_3_3(ptr %a) { +; CHECK-LABEL: f5_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB66_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f5_4_1(ptr %a) { +; CHECK-LABEL: f5_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB67_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f5_4_2(ptr %a) { +; CHECK-LABEL: f5_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB68_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f5_4_3(ptr %a) { +; CHECK-LABEL: f5_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB69_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Nested select_ccmask with TrueVal and FalseVal swapped with each other. 
+define i64 @f6_1(ptr %a) { +; CHECK-LABEL: f6_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB70_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %select = select i1 %cmpeq3, i64 5, i64 15 + %res = select i1 %cmpeq0, i64 %select, i64 5 + ret i64 %res +} + diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll index 1e2b332..47c7918 100644 --- a/llvm/test/CodeGen/Thumb2/carry.ll +++ b/llvm/test/CodeGen/Thumb2/carry.ll @@ -1,35 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s define i64 @f1(i64 %a, i64 %b) { -entry: ; CHECK-LABEL: f1: -; CHECK: subs r0, r0, r2 -; CHECK: sbcs r1, r3 - %tmp = sub i64 %a, %b - ret i64 %tmp +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r1, r3 +; CHECK-NEXT: bx lr +entry: + %tmp = sub i64 %a, %b + ret i64 %tmp } define i64 @f2(i64 %a, i64 %b) { -entry: ; CHECK-LABEL: f2: -; CHECK: lsls r1, r1, #1 -; CHECK: orr.w r1, r1, r0, lsr #31 -; CHECK: rsbs r0, r2, r0, lsl #1 -; CHECK: sbcs r1, r3 - %tmp1 = shl i64 %a, 1 - %tmp2 = sub i64 %tmp1, %b - ret i64 %tmp2 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: lsls r1, r1, #1 +; CHECK-NEXT: orr.w r1, r1, r0, lsr #31 +; CHECK-NEXT: rsbs r0, r2, r0, lsl #1 +; CHECK-NEXT: sbcs r1, r3 +; CHECK-NEXT: bx lr +entry: + %tmp1 = shl i64 %a, 1 + %tmp2 = sub i64 %tmp1, %b + ret i64 %tmp2 } ; rdar://12559385 define i64 @f3(i32 %vi) { -entry: ; CHECK-LABEL: f3: 
-; CHECK: movw [[REG:r[0-9]+]], #36102 -; CHECK: sbcs r{{[0-9]+}}, [[REG]] - %v0 = zext i32 %vi to i64 - %v1 = xor i64 %v0, -155057456198619 - %v4 = add i64 %v1, 155057456198619 - %v5 = add i64 %v4, %v1 - ret i64 %v5 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r1, #19493 +; CHECK-NEXT: movt r1, #57191 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: movw r2, #29433 +; CHECK-NEXT: movw r3, #46043 +; CHECK-NEXT: movw r1, #36102 +; CHECK-NEXT: movt r2, #65535 +; CHECK-NEXT: adds r0, r0, r0 +; CHECK-NEXT: movt r3, #8344 +; CHECK-NEXT: sbcs r2, r1 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: bx lr +entry: + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 } diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll index ae170d7..d949068 100644 --- a/llvm/test/CodeGen/WebAssembly/bulk-memory.ll +++ b/llvm/test/CodeGen/WebAssembly/bulk-memory.ll @@ -104,6 +104,31 @@ define void @memset_i32(ptr %dest, i8 %val, i32 %len) { ret void } +; CHECK-LABEL: memcpy_0: +; CHECK-NEXT: .functype memcpy_0 (i32, i32) -> () +; CHECK-NEXT: return +define void @memcpy_0(ptr %dest, ptr %src) { + call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0) + ret void +} + +; CHECK-LABEL: memmove_0: +; CHECK-NEXT: .functype memmove_0 (i32, i32) -> () +; CHECK-NEXT: return +define void @memmove_0(ptr %dest, ptr %src) { + call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 0, i1 0) + ret void +} + +; CHECK-LABEL: memset_0: +; NO-BULK-MEM-NOT: memory.fill +; BULK-MEM-NEXT: .functype memset_0 (i32, i32) -> () +; BULK-MEM-NEXT: return +define void @memset_0(ptr %dest, i8 %val) { + call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 0, i1 0) + ret void +} + ; CHECK-LABEL: memcpy_1: ; CHECK-NEXT: .functype memcpy_1 (i32, i32) -> () ; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1) @@ -137,14 +162,8 @@ define void 
@memset_1(ptr %dest, i8 %val) { ; CHECK-LABEL: memcpy_1024: ; NO-BULK-MEM-NOT: memory.copy ; BULK-MEM-NEXT: .functype memcpy_1024 (i32, i32) -> () -; BULK-MEM-NEXT: block ; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]] -; BULK-MEM-NEXT: br_if 0, $pop[[L1]] -; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]] ; BULK-MEM-NEXT: return define void @memcpy_1024(ptr %dest, ptr %src) { call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0) @@ -154,14 +173,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) { ; CHECK-LABEL: memmove_1024: ; NO-BULK-MEM-NOT: memory.copy ; BULK-MEM-NEXT: .functype memmove_1024 (i32, i32) -> () -; BULK-MEM-NEXT: block ; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]] -; BULK-MEM-NEXT: br_if 0, $pop[[L1]] -; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L2]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]] ; BULK-MEM-NEXT: return define void @memmove_1024(ptr %dest, ptr %src) { call void @llvm.memmove.p0.p0.i32(ptr %dest, ptr %src, i32 1024, i1 0) @@ -171,14 +184,8 @@ define void @memmove_1024(ptr %dest, ptr %src) { ; CHECK-LABEL: memset_1024: ; NO-BULK-MEM-NOT: memory.fill ; BULK-MEM-NEXT: .functype memset_1024 (i32, i32) -> () -; BULK-MEM-NEXT: block ; BULK-MEM-NEXT: i32.const $push[[L0:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i32.eqz $push[[L1:[0-9]+]]=, $pop[[L0]] -; BULK-MEM-NEXT: br_if 0, $pop[[L1]] -; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L2]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]] ; BULK-MEM-NEXT: 
return define void @memset_1024(ptr %dest, i8 %val) { call void @llvm.memset.p0.i32(ptr %dest, i8 %val, i32 1024, i1 0) @@ -201,17 +208,11 @@ define void @memset_1024(ptr %dest, i8 %val) { ; BULK-MEM-NEXT: .functype memcpy_alloca_src (i32) -> () ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]] -; BULK-MEM-NEXT: br_if 0, $pop[[L4]] -; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]] -; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]] +; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]] ; BULK-MEM-NEXT: return define void @memcpy_alloca_src(ptr %dst) { %a = alloca [100 x i8] @@ -224,17 +225,11 @@ define void @memcpy_alloca_src(ptr %dst) { ; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i32) -> () ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i32.sub $[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i32.eqz $push[[L4:[0-9]+]]=, $pop[[L3]] -; BULK-MEM-NEXT: br_if 0, $pop[[L4]] -; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i32.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]] -; BULK-MEM-NEXT: i32.const $push[[L7:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, 
$pop[[L7]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]] +; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]] ; BULK-MEM-NEXT: return define void @memcpy_alloca_dst(ptr %src) { %a = alloca [100 x i8] @@ -247,17 +242,11 @@ define void @memcpy_alloca_dst(ptr %src) { ; BULK-MEM-NEXT: .functype memset_alloca (i32) -> () ; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i32.sub $1=, $pop[[L0]], $pop[[L1]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i32.const $push[[L2:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i32.eqz $push[[L3:[0-9]+]]=, $pop[[L2]] -; BULK-MEM-NEXT: br_if 0, $pop[[L3]] -; BULK-MEM-NEXT: i32.const $push[[L4:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i32.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]] -; BULK-MEM-NEXT: i32.const $push[[L6:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]] +; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]] ; BULK-MEM-NEXT: return define void @memset_alloca(i8 %val) { %a = alloca [100 x i8] diff --git a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll index 0cf8493..d0206a3 100644 --- a/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll +++ b/llvm/test/CodeGen/WebAssembly/bulk-memory64.ll @@ -110,6 +110,31 @@ define void @memset_i32(ptr %dest, i8 %val, i64 %len) { ret void } +; CHECK-LABEL: memcpy_0: +; CHECK-NEXT: .functype memcpy_0 
(i64, i64) -> () +; CHECK-NEXT: return +define void @memcpy_0(ptr %dest, ptr %src) { + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0) + ret void +} + +; CHECK-LABEL: memmove_0: +; CHECK-NEXT: .functype memmove_0 (i64, i64) -> () +; CHECK-NEXT: return +define void @memmove_0(ptr %dest, ptr %src) { + call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 0, i1 0) + ret void +} + +; CHECK-LABEL: memset_0: +; NO-BULK-MEM-NOT: memory.fill +; BULK-MEM-NEXT: .functype memset_0 (i64, i32) -> () +; BULK-MEM-NEXT: return +define void @memset_0(ptr %dest, i8 %val) { + call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 0, i1 0) + ret void +} + ; CHECK-LABEL: memcpy_1: ; CHECK-NEXT: .functype memcpy_1 (i64, i64) -> () ; CHECK-NEXT: i32.load8_u $push[[L0:[0-9]+]]=, 0($1) @@ -143,14 +168,8 @@ define void @memset_1(ptr %dest, i8 %val) { ; CHECK-LABEL: memcpy_1024: ; NO-BULK-MEM-NOT: memory.copy ; BULK-MEM-NEXT: .functype memcpy_1024 (i64, i64) -> () -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]] -; BULK-MEM-NEXT: br_if 0, $pop0 ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024 ; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block ; BULK-MEM-NEXT: return define void @memcpy_1024(ptr %dest, ptr %src) { call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0) @@ -160,14 +179,8 @@ define void @memcpy_1024(ptr %dest, ptr %src) { ; CHECK-LABEL: memmove_1024: ; NO-BULK-MEM-NOT: memory.copy ; BULK-MEM-NEXT: .functype memmove_1024 (i64, i64) -> () -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]] -; BULK-MEM-NEXT: br_if 0, $pop0 ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024 ; BULK-MEM-NEXT: memory.copy 0, 0, $0, $1, $pop[[L0]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block ; BULK-MEM-NEXT: return define void 
@memmove_1024(ptr %dest, ptr %src) { call void @llvm.memmove.p0.p0.i64(ptr %dest, ptr %src, i64 1024, i1 0) @@ -177,14 +190,8 @@ define void @memmove_1024(ptr %dest, ptr %src) { ; CHECK-LABEL: memset_1024: ; NO-BULK-MEM-NOT: memory.fill ; BULK-MEM-NEXT: .functype memset_1024 (i64, i32) -> () -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 1024 -; BULK-MEM-NEXT: i64.eqz $push0=, $pop[[L1]] -; BULK-MEM-NEXT: br_if 0, $pop0 ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 1024 ; BULK-MEM-NEXT: memory.fill 0, $0, $1, $pop[[L0]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block ; BULK-MEM-NEXT: return define void @memset_1024(ptr %dest, i8 %val) { call void @llvm.memset.p0.i64(ptr %dest, i8 %val, i64 1024, i1 0) @@ -207,17 +214,11 @@ define void @memset_1024(ptr %dest, i8 %val) { ; BULK-MEM-NEXT: .functype memcpy_alloca_src (i64) -> () ; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]] -; BULK-MEM-NEXT: br_if 0, $pop[[L4]] -; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]] -; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L6]], $pop[[L7]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]] +; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]] ; BULK-MEM-NEXT: return define void @memcpy_alloca_src(ptr %dst) { %a = alloca [100 x i8] @@ -230,17 +231,11 @@ define void @memcpy_alloca_src(ptr %dst) { ; 
BULK-MEM-NEXT: .functype memcpy_alloca_dst (i64) -> () ; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i64.sub $[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i64.eqz $push[[L4:[0-9]+]]=, $pop[[L3]] -; BULK-MEM-NEXT: br_if 0, $pop[[L4]] -; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i64.add $push[[L6:[0-9]+]]=, $[[L2]], $pop[[L5]] -; BULK-MEM-NEXT: i64.const $push[[L7:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L6]], $0, $pop[[L7]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]] +; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]] ; BULK-MEM-NEXT: return define void @memcpy_alloca_dst(ptr %src) { %a = alloca [100 x i8] @@ -253,17 +248,11 @@ define void @memcpy_alloca_dst(ptr %src) { ; BULK-MEM-NEXT: .functype memset_alloca (i32) -> () ; BULK-MEM-NEXT: global.get $push[[L1:[0-9]+]]=, __stack_pointer ; BULK-MEM-NEXT: i64.const $push[[L0:[0-9]+]]=, 112 -; BULK-MEM-NEXT: i64.sub $1=, $pop[[L1]], $pop[[L0]] -; BULK-MEM-NEXT: block -; BULK-MEM-NEXT: i64.const $push[[L2:[0-9]+]]=, 100 -; BULK-MEM-NEXT: i64.eqz $push[[L3:[0-9]+]]=, $pop[[L2]] -; BULK-MEM-NEXT: br_if 0, $pop[[L3]] -; BULK-MEM-NEXT: i64.const $push[[L4:[0-9]+]]=, 12 -; BULK-MEM-NEXT: i64.add $push[[L5:[0-9]+]]=, $1, $pop[[L4]] -; BULK-MEM-NEXT: i64.const $push[[L6:[0-9]+]]=, 100 -; BULK-MEM-NEXT: memory.fill 0, $pop[[L5]], $0, $pop[[L6]] -; BULK-MEM-NEXT: .LBB{{.*}}: -; BULK-MEM-NEXT: end_block +; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L1]], $pop[[L0]] +; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12 +; BULK-MEM-NEXT: i64.add 
$push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]] +; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100 +; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]] ; BULK-MEM-NEXT: return define void @memset_alloca(i8 %val) { %a = alloca [100 x i8] diff --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll new file mode 100644 index 0000000..abbd953 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=mvp -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s + +; This test ensures that loads and stores generated for small memcpy et al use +; constant offset folding. + + +target triple = "wasm32-unknown-unknown" + +define void @call_memset(ptr) #0 { +; CHECK-LABEL: call_memset: +; CHECK: .functype call_memset (i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.const $push0=, 0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.const $push1=, 0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memset.p0.i32(ptr align 1 %0, i8 0, i32 16, i1 false) + ret void +} + +define void @call_memcpy(ptr %dst, ptr %src) #0 { +; CHECK-LABEL: call_memcpy: +; CHECK: .functype call_memcpy (i32, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.load $push1=, 0($1):p2align=0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false) + ret void +} + + +define void @call_memmove(ptr %dst, ptr %src) #0 { +; CHECK-LABEL: call_memmove: +; CHECK: .functype call_memmove (i32, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $2=, 0($1):p2align=0 +; CHECK-NEXT: i64.load $push0=, 
8($1):p2align=0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $2 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memmove.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false) + ret void +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll new file mode 100644 index 0000000..3654aae --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_sext_1: +; CHECK: .functype dot_sext_1 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + + +define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_sext_2: +; CHECK: .functype dot_sext_2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + 
%res = add <4 x i32> %shuffle2, %shuffle1 + ret <4 x i32> %res +} + +define <4 x i32> @dot_sext_self(<8 x i16> %v) { +; CHECK-LABEL: dot_sext_self: +; CHECK: .functype dot_sext_self (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext = sext <8 x i16> %v to <8 x i32> + %mul = mul <8 x i32> %sext, %sext + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + +; INFO: Negative test +define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_zext: +; CHECK: .functype dot_zext (v128, v128) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_low_i16x8_u +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_high_i16x8_u +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %zext1 = zext <8 x i16> %a to <8 x i32> + %zext2 = zext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %zext1, %zext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + +; INFO: Negative test +define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_wrong_shuffle: +; CHECK: .functype dot_wrong_shuffle 
(v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_low_i16x8_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_high_i16x8_s +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll index e065de3..600241a 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -2,9 +2,278 @@ ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128, | FileCheck %s --check-prefix=STRICT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=NOFP16 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NOSIMD target triple = "wasm32" +define half @fadd_fmul_contract_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fadd_fmul_contract_f16: +; RELAXED: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $1 +; RELAXED-NEXT: 
call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fadd_fmul_contract_f16: +; STRICT: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $0 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $1 +; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fadd_fmul_contract_f16: +; NOFP16: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fadd_fmul_contract_f16: +; NOSIMD: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $0 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %mul = fmul contract half %b, %a + %add = fadd contract half %mul, %c + ret 
half %add +} + +define half @fmuladd_contract_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fmuladd_contract_f16: +; RELAXED: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $1 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fmuladd_contract_f16: +; STRICT: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $1 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $0 +; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fmuladd_contract_f16: +; NOFP16: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fmuladd_contract_f16: +; NOSIMD: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, 
__truncsfhf2, $0 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c) + ret half %fma +} + +define half @fmuladd_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fmuladd_f16: +; RELAXED: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $1 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fmuladd_f16: +; STRICT: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $1 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $0 +; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fmuladd_f16: +; NOFP16: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, 
$pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fmuladd_f16: +; NOSIMD: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %fma = call half @llvm.fmuladd(half %a, half %b, half %c) + ret half %fma +} + + +define float @fadd_fmul_contract_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fadd_fmul_contract_f32: +; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $1, $0 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fadd_fmul_contract_f32: +; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $1, $0 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_f32: +; NOFP16: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $1, $0 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_f32: +; NOSIMD: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $1, $0 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %mul = fmul contract float %b, %a + %add = fadd contract float %mul, %c + ret float %add +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fmuladd_contract_f32: +; RELAXED: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; 
RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $0, $1 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_contract_f32: +; STRICT: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $0, $1 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_f32: +; NOFP16: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $0, $1 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_f32: +; NOSIMD: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $0, $1 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c) + ret float %fma +} + +define float @fmuladd_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fmuladd_f32: +; RELAXED: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $0, $1 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_f32: +; STRICT: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $0, $1 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_f32: +; NOFP16: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $0, $1 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_f32: +; NOSIMD: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $0, $1 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call float @llvm.fmuladd(float %a, float %b, 
float %c) + ret float %fma +} + define double @fadd_fmul_contract_f64(double %a, double %b, double %c) { ; RELAXED-LABEL: fadd_fmul_contract_f64: ; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) @@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) { ; STRICT-NEXT: f64.mul $push0=, $1, $0 ; STRICT-NEXT: f64.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_f64: +; NOFP16: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $1, $0 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_f64: +; NOSIMD: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $1, $0 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 %mul = fmul contract double %b, %a %add = fadd contract double %mul, %c ret double %add } +define double @fmuladd_f64(double %a, double %b, double %c) { +; RELAXED-LABEL: fmuladd_f64: +; RELAXED: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64.mul $push0=, $0, $1 +; RELAXED-NEXT: f64.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_f64: +; STRICT: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64.mul $push0=, $0, $1 +; STRICT-NEXT: f64.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_f64: +; NOFP16: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $0, $1 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_f64: +; NOSIMD: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $0, $1 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call 
double @llvm.fmuladd(double %a, double %b, double %c) + ret double %fma +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) { +; RELAXED-LABEL: fmuladd_contract_f64: +; RELAXED: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64.mul $push0=, $0, $1 +; RELAXED-NEXT: f64.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_contract_f64: +; STRICT: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64.mul $push0=, $0, $1 +; STRICT-NEXT: f64.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_f64: +; NOFP16: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $0, $1 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_f64: +; NOSIMD: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $0, $1 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c) + ret double %fma +} + define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fadd_fmul_contract_4xf32: ; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_4xf32: @@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 ; STRICT-NEXT: f32x4.mul $push0=, $1, $0 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_4xf32: +; NOFP16: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128) +; 
NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $1, $0 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_4xf32: +; NOSIMD: .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $4 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $3 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $2 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $1 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %mul = fmul contract <4 x float> %b, %a %add = fadd contract <4 x float> %mul, %c ret <4 x float> %add } - define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; RELAXED-LABEL: fadd_fmul_contract_8xf16: ; RELAXED: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $1, $0 +; RELAXED-NEXT: f16x8.madd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_8xf16: ; STRICT: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: -; STRICT-NEXT: f16x8.mul $push0=, $1, $0 -; STRICT-NEXT: f16x8.add $push1=, $pop0, $2 -; STRICT-NEXT: return $pop1 +; STRICT-NEXT: f16x8.madd $push0=, $1, $0, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fadd_fmul_contract_8xf16: +; NOFP16: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; 
NOFP16-NEXT: call $push2=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; NOFP16-NEXT: call $push9=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call 
$push38=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; NOFP16-NEXT: call $push45=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: call $push54=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fadd_fmul_contract_8xf16: +; NOSIMD: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $7 +; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add 
$push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, 
$pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return %mul = fmul contract <8 x half> %b, %a %add = fadd contract <8 x half> %mul, %c ret <8 x half> %add } - define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fadd_fmul_4xf32: ; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128) @@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> ; STRICT-NEXT: f32x4.mul $push0=, $1, $0 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_4xf32: +; NOFP16: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $1, $0 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_4xf32: +; NOSIMD: .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $4 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $3 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $2 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $1 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %mul = fmul <4 x float> %b, %a %add = fadd contract <4 x float> %mul, %c ret <4 x float> %add } +define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_contract_8xf16: +; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: 
fmuladd_contract_8xf16: +; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fmuladd_contract_8xf16: +; NOFP16: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; NOFP16-NEXT: call $push9=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push28=, 
__extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call $push38=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; NOFP16-NEXT: call $push45=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: call $push54=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push64=, 
__extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_contract_8xf16: +; NOSIMD: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, 
__extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, 
__extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return + %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + +define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_8xf16: +; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_8xf16: +; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fmuladd_8xf16: +; NOFP16: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; 
NOFP16-NEXT: call $push9=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call $push38=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; 
NOFP16-NEXT: call $push45=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: call $push54=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_8xf16: +; NOSIMD: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: 
call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, 
__truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return + %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fmuladd_contract_4xf32: ; RELAXED: .functype 
fmuladd_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fmuladd_contract_4xf32: @@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x ; STRICT-NEXT: f32x4.mul $push0=, $0, $1 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_4xf32: +; NOFP16: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $0, $1 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_4xf32: +; NOSIMD: .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $4, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $3, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $2, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $1, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } -; TODO: This should also have relaxed_madd in RELAXED case define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fmuladd_4xf32: ; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 -; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 -; RELAXED-NEXT: return $pop1 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2 +; 
RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fmuladd_4xf32: ; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) @@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c ; STRICT-NEXT: f32x4.mul $push0=, $0, $1 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_4xf32: +; NOFP16: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $0, $1 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_4xf32: +; NOSIMD: .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $4, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $3, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $2, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $1, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } +define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; RELAXED-LABEL: fmuladd_8xf32: +; RELAXED: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.mul $push0=, $2, $4 +; RELAXED-NEXT: f32x4.add $push1=, $pop0, $6 +; RELAXED-NEXT: v128.store 16($0), $pop1 +; RELAXED-NEXT: f32x4.mul $push2=, $1, $3 +; RELAXED-NEXT: f32x4.add $push3=, $pop2, $5 +; RELAXED-NEXT: v128.store 0($0), $pop3 +; RELAXED-NEXT: return +; +; STRICT-LABEL: fmuladd_8xf32: +; STRICT: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> 
() +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $2, $4 +; STRICT-NEXT: f32x4.add $push1=, $pop0, $6 +; STRICT-NEXT: v128.store 16($0), $pop1 +; STRICT-NEXT: f32x4.mul $push2=, $1, $3 +; STRICT-NEXT: f32x4.add $push3=, $pop2, $5 +; STRICT-NEXT: v128.store 0($0), $pop3 +; STRICT-NEXT: return +; +; NOFP16-LABEL: fmuladd_8xf32: +; NOFP16: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $2, $4 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6 +; NOFP16-NEXT: v128.store 16($0), $pop1 +; NOFP16-NEXT: f32x4.mul $push2=, $1, $3 +; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5 +; NOFP16-NEXT: v128.store 0($0), $pop3 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_8xf32: +; NOSIMD: .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $16 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $24 +; NOSIMD-NEXT: f32.store 28($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $15 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $23 +; NOSIMD-NEXT: f32.store 24($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $14 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $22 +; NOSIMD-NEXT: f32.store 20($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $13 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $21 +; NOSIMD-NEXT: f32.store 16($0), $pop7 +; NOSIMD-NEXT: f32.mul $push8=, $4, $12 +; NOSIMD-NEXT: f32.add $push9=, $pop8, $20 +; NOSIMD-NEXT: f32.store 12($0), $pop9 +; NOSIMD-NEXT: f32.mul $push10=, $3, $11 +; NOSIMD-NEXT: f32.add $push11=, $pop10, $19 +; NOSIMD-NEXT: f32.store 8($0), $pop11 +; NOSIMD-NEXT: f32.mul $push12=, $2, $10 +; NOSIMD-NEXT: f32.add $push13=, $pop12, $18 +; NOSIMD-NEXT: f32.store 4($0), $pop13 +; NOSIMD-NEXT: f32.mul $push14=, $1, $9 +; NOSIMD-NEXT: f32.add $push15=, $pop14, $17 +; NOSIMD-NEXT: f32.store 0($0), $pop15 +; NOSIMD-NEXT: return + %fma = call 
<8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c) + ret <8 x float> %fma +} + +define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_contract_2xf64: +; RELAXED: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_contract_2xf64: +; STRICT: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_2xf64: +; NOFP16: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $0, $1 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_2xf64: +; NOSIMD: .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $2, $4 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $1, $3 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} + +define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_2xf64: +; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_2xf64: +; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.add $push1=, 
$pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_2xf64: +; NOFP16: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $0, $1 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_2xf64: +; NOSIMD: .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $2, $4 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $1, $3 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} + define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fma_4xf32: ; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128) @@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 ; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 ; STRICT-NEXT: return $pop19 +; +; NOFP16-LABEL: fma_4xf32: +; NOFP16: .functype fma_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.extract_lane $push2=, $0, 0 +; NOFP16-NEXT: f32x4.extract_lane $push1=, $1, 0 +; NOFP16-NEXT: f32x4.extract_lane $push0=, $2, 0 +; NOFP16-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0 +; NOFP16-NEXT: f32x4.splat $push4=, $pop3 +; NOFP16-NEXT: f32x4.extract_lane $push7=, $0, 1 +; NOFP16-NEXT: f32x4.extract_lane $push6=, $1, 1 +; NOFP16-NEXT: f32x4.extract_lane $push5=, $2, 1 +; NOFP16-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5 +; NOFP16-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8 +; NOFP16-NEXT: f32x4.extract_lane $push12=, $0, 2 +; NOFP16-NEXT: f32x4.extract_lane $push11=, $1, 2 +; NOFP16-NEXT: f32x4.extract_lane 
$push10=, $2, 2 +; NOFP16-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10 +; NOFP16-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13 +; NOFP16-NEXT: f32x4.extract_lane $push17=, $0, 3 +; NOFP16-NEXT: f32x4.extract_lane $push16=, $1, 3 +; NOFP16-NEXT: f32x4.extract_lane $push15=, $2, 3 +; NOFP16-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 +; NOFP16-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 +; NOFP16-NEXT: return $pop19 +; +; NOSIMD-LABEL: fma_4xf32: +; NOSIMD: .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fmaf, $4, $8, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop0 +; NOSIMD-NEXT: call $push1=, fmaf, $3, $7, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop1 +; NOSIMD-NEXT: call $push2=, fmaf, $2, $6, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop2 +; NOSIMD-NEXT: call $push3=, fmaf, $1, $5, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop3 +; NOSIMD-NEXT: return %fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } @@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; RELAXED-LABEL: fadd_fmul_contract_8xf32: ; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $4, $2, $6 ; RELAXED-NEXT: v128.store 16($0), $pop0 -; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1 +; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $3, $1, $5 ; RELAXED-NEXT: v128.store 0($0), $pop1 ; RELAXED-NEXT: return ; @@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; STRICT-NEXT: f32x4.add $push3=, $pop2, $5 ; STRICT-NEXT: v128.store 0($0), $pop3 ; STRICT-NEXT: return +; +; NOFP16-LABEL: fadd_fmul_contract_8xf32: +; NOFP16: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, 
v128, v128, v128) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $4, $2 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6 +; NOFP16-NEXT: v128.store 16($0), $pop1 +; NOFP16-NEXT: f32x4.mul $push2=, $3, $1 +; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5 +; NOFP16-NEXT: v128.store 0($0), $pop3 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fadd_fmul_contract_8xf32: +; NOSIMD: .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $16, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $24 +; NOSIMD-NEXT: f32.store 28($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $15, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $23 +; NOSIMD-NEXT: f32.store 24($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $14, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $22 +; NOSIMD-NEXT: f32.store 20($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $13, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $21 +; NOSIMD-NEXT: f32.store 16($0), $pop7 +; NOSIMD-NEXT: f32.mul $push8=, $12, $4 +; NOSIMD-NEXT: f32.add $push9=, $pop8, $20 +; NOSIMD-NEXT: f32.store 12($0), $pop9 +; NOSIMD-NEXT: f32.mul $push10=, $11, $3 +; NOSIMD-NEXT: f32.add $push11=, $pop10, $19 +; NOSIMD-NEXT: f32.store 8($0), $pop11 +; NOSIMD-NEXT: f32.mul $push12=, $10, $2 +; NOSIMD-NEXT: f32.add $push13=, $pop12, $18 +; NOSIMD-NEXT: f32.store 4($0), $pop13 +; NOSIMD-NEXT: f32.mul $push14=, $9, $1 +; NOSIMD-NEXT: f32.add $push15=, $pop14, $17 +; NOSIMD-NEXT: f32.store 0($0), $pop15 +; NOSIMD-NEXT: return %mul = fmul contract <8 x float> %b, %a %add = fadd contract <8 x float> %mul, %c ret <8 x float> %add } - define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; RELAXED-LABEL: fadd_fmul_contract_2xf64: ; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f64x2.relaxed_madd 
$push0=, $2, $1, $0 +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_2xf64: @@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, ; STRICT-NEXT: f64x2.mul $push0=, $1, $0 ; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_2xf64: +; NOFP16: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $1, $0 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_2xf64: +; NOSIMD: .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $4, $2 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $3, $1 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return %mul = fmul contract <2 x double> %b, %a %add = fadd contract <2 x double> %mul, %c ret <2 x double> %add } -define float @fadd_fmul_contract_f32(float %a, float %b, float %c) { -; RELAXED-LABEL: fadd_fmul_contract_f32: -; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fadd_fmul_2xf64: +; RELAXED: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32.mul $push0=, $1, $0 -; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: f64x2.mul $push0=, $1, $0 +; RELAXED-NEXT: f64x2.add $push1=, $pop0, $2 ; RELAXED-NEXT: return $pop1 ; -; STRICT-LABEL: fadd_fmul_contract_f32: -; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-LABEL: fadd_fmul_2xf64: +; STRICT: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: 
-; STRICT-NEXT: f32.mul $push0=, $1, $0 -; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: f64x2.mul $push0=, $1, $0 +; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 - %mul = fmul contract float %b, %a - %add = fadd contract float %mul, %c - ret float %add +; +; NOFP16-LABEL: fadd_fmul_2xf64: +; NOFP16: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $1, $0 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_2xf64: +; NOSIMD: .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $4, $2 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $3, $1 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %mul = fmul <2 x double> %b, %a + %add = fadd <2 x double> %mul, %c + ret <2 x double> %add } define float @fma_f32(float %a, float %b, float %c) { @@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) { ; STRICT-NEXT: # %bb.0: ; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2 ; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fma_f32: +; NOFP16: .functype fma_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, fmaf, $0, $1, $2 +; NOFP16-NEXT: return $pop0 +; +; NOSIMD-LABEL: fma_f32: +; NOSIMD: .functype fma_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fmaf, $0, $1, $2 +; NOSIMD-NEXT: return $pop0 %fma = call float @llvm.fma(float %a, float %b, float %c) ret float %fma } @@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) { ; STRICT-NEXT: # %bb.0: ; STRICT-NEXT: call $push0=, fma, $0, $1, $2 ; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fma_f64: +; NOFP16: .functype fma_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; 
NOFP16-NEXT: call $push0=, fma, $0, $1, $2 +; NOFP16-NEXT: return $pop0 +; +; NOSIMD-LABEL: fma_f64: +; NOSIMD: .functype fma_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fma, $0, $1, $2 +; NOSIMD-NEXT: return $pop0 %fma = call double @llvm.fma(double %a, double %b, double %c) ret double %fma } diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll index 6e2d860..b90c1da 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll @@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 ; RELAXED-LABEL: fsub_fmul_contract_4xf32: ; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fsub_fmul_contract_4xf32: @@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h ; RELAXED-LABEL: fsub_fmul_contract_8xf16: ; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f16x8.nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fsub_fmul_contract_8xf16: ; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: -; STRICT-NEXT: f16x8.mul $push0=, $1, $0 -; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0 -; STRICT-NEXT: return $pop1 +; STRICT-NEXT: f16x8.nmadd $push0=, $1, $0, $2 +; STRICT-NEXT: return $pop0 %mul = fmul contract <8 x half> %b, %a %sub = fsub contract <8 x half> %c, %mul ret <8 x half> %sub @@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; RELAXED-LABEL: fsub_fmul_contract_8xf32: ; RELAXED: .functype 
fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $4, $2, $6 ; RELAXED-NEXT: v128.store 16($0), $pop0 -; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $3, $1, $5 ; RELAXED-NEXT: v128.store 0($0), $pop1 ; RELAXED-NEXT: return ; @@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, ; RELAXED-LABEL: fsub_fmul_contract_2xf64: ; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fsub_fmul_contract_2xf64: @@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) { ret float %sub } +define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_8xf16: +; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_8xf16: +; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.nmadd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 + %fneg = fneg <8 x half> %a + %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + +define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; RELAXED-LABEL: fmuladd_4xf32: +; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_4xf32: +; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; 
STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $0, $1 +; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %fneg = fneg <4 x float> %a + %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c) + ret <4 x float> %fma +} + +define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_2xf64: +; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_2xf64: +; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %fneg = fneg <2 x double> %a + %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} diff --git a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll index 28b4541..7bdc4e1 100644 --- a/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll +++ b/llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll @@ -44,7 +44,7 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: callq __ubyte_convert_to_ctype ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: js LBB0_6 +; CHECK-NEXT: js LBB0_4 ; CHECK-NEXT: ## %bb.1: ## %cond_next.i ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: movq %rbx, %rdi @@ -53,84 +53,81 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: sarl $31, %ecx ; CHECK-NEXT: andl %eax, %ecx ; CHECK-NEXT: cmpl $-2, %ecx -; CHECK-NEXT: je LBB0_10 +; CHECK-NEXT: je LBB0_8 ; CHECK-NEXT: ## %bb.2: ## %cond_next.i ; CHECK-NEXT: cmpl $-1, %ecx -; CHECK-NEXT: jne LBB0_3 -; CHECK-NEXT: LBB0_8: ## %bb4 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: LBB0_3: ## %bb4 ; CHECK-NEXT: movq 
_PyArray_API@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: movq 16(%rax), %rax -; CHECK-NEXT: jmp LBB0_9 -; CHECK-NEXT: LBB0_6: ## %_ubyte_convert2_to_ctypes.exit +; CHECK-NEXT: jmp LBB0_10 +; CHECK-NEXT: LBB0_4: ## %_ubyte_convert2_to_ctypes.exit ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: je LBB0_10 -; CHECK-NEXT: ## %bb.7: ## %_ubyte_convert2_to_ctypes.exit -; CHECK-NEXT: cmpl $-1, %eax ; CHECK-NEXT: je LBB0_8 -; CHECK-NEXT: LBB0_3: ## %bb35 +; CHECK-NEXT: ## %bb.5: ## %_ubyte_convert2_to_ctypes.exit +; CHECK-NEXT: cmpl $-1, %eax +; CHECK-NEXT: je LBB0_3 +; CHECK-NEXT: LBB0_6: ## %bb35 ; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %r14 ; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: callq *216(%rax) ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: testb %dl, %dl -; CHECK-NEXT: je LBB0_4 -; CHECK-NEXT: ## %bb.12: ## %cond_false.i -; CHECK-NEXT: setne %dil +; CHECK-NEXT: je LBB0_11 +; CHECK-NEXT: ## %bb.7: ## %cond_false.i ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: movzbl %sil, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: divb %dl ; CHECK-NEXT: movl %eax, %r15d ; CHECK-NEXT: testb %cl, %cl -; CHECK-NEXT: setne %al -; CHECK-NEXT: testb %dil, %al -; CHECK-NEXT: jne LBB0_5 -; CHECK-NEXT: LBB0_13: ## %cond_true.i200 -; CHECK-NEXT: testb %dl, %dl -; CHECK-NEXT: jne LBB0_15 -; CHECK-NEXT: ## %bb.14: ## %cond_true14.i -; CHECK-NEXT: movl $4, %edi -; CHECK-NEXT: callq _feraiseexcept -; CHECK-NEXT: LBB0_15: ## %ubyte_ctype_remainder.exit -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: jmp LBB0_16 -; CHECK-NEXT: LBB0_10: ## %bb17 +; CHECK-NEXT: jne LBB0_12 +; CHECK-NEXT: jmp LBB0_14 +; CHECK-NEXT: LBB0_8: ## %bb17 ; CHECK-NEXT: callq _PyErr_Occurred ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: jne LBB0_23 -; CHECK-NEXT: ## %bb.11: ## %cond_next +; CHECK-NEXT: jne LBB0_27 +; CHECK-NEXT: ## %bb.9: ## %cond_next ; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: movq 80(%rax), %rax 
-; CHECK-NEXT: LBB0_9: ## %bb4 +; CHECK-NEXT: LBB0_10: ## %bb4 ; CHECK-NEXT: movq 96(%rax), %rax ; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: movq %rbx, %rsi ; CHECK-NEXT: callq *40(%rax) -; CHECK-NEXT: jmp LBB0_24 -; CHECK-NEXT: LBB0_4: ## %cond_true.i +; CHECK-NEXT: jmp LBB0_28 +; CHECK-NEXT: LBB0_11: ## %cond_true.i ; CHECK-NEXT: movl $4, %edi ; CHECK-NEXT: callq _feraiseexcept ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: xorl %r15d, %r15d ; CHECK-NEXT: testb %sil, %sil -; CHECK-NEXT: sete %al +; CHECK-NEXT: je LBB0_14 +; CHECK-NEXT: LBB0_12: ## %cond_false.i ; CHECK-NEXT: testb %dl, %dl -; CHECK-NEXT: sete %cl -; CHECK-NEXT: xorl %r15d, %r15d -; CHECK-NEXT: orb %al, %cl -; CHECK-NEXT: jne LBB0_13 -; CHECK-NEXT: LBB0_5: ## %cond_next17.i +; CHECK-NEXT: je LBB0_14 +; CHECK-NEXT: ## %bb.13: ## %cond_next17.i ; CHECK-NEXT: movzbl %sil, %eax ; CHECK-NEXT: divb %dl ; CHECK-NEXT: movzbl %ah, %ebx -; CHECK-NEXT: LBB0_16: ## %ubyte_ctype_remainder.exit +; CHECK-NEXT: jmp LBB0_18 +; CHECK-NEXT: LBB0_14: ## %cond_true.i200 +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: jne LBB0_17 +; CHECK-NEXT: ## %bb.16: ## %cond_true14.i +; CHECK-NEXT: movl $4, %edi +; CHECK-NEXT: callq _feraiseexcept +; CHECK-NEXT: LBB0_17: ## %ubyte_ctype_remainder.exit +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: LBB0_18: ## %ubyte_ctype_remainder.exit ; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: callq *224(%rax) ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: je LBB0_19 -; CHECK-NEXT: ## %bb.17: ## %cond_true61 +; CHECK-NEXT: je LBB0_21 +; CHECK-NEXT: ## %bb.19: ## %cond_true61 ; CHECK-NEXT: movl %eax, %ebp ; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: movq _.str5@GOTPCREL(%rip), %rdi @@ -139,8 +136,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: callq *200(%rax) ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: js LBB0_23 -; CHECK-NEXT: ## %bb.18: ## %cond_next73 +; CHECK-NEXT: js LBB0_27 +; 
CHECK-NEXT: ## %bb.20: ## %cond_next73 ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq (%r14), %rax ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi @@ -149,13 +146,13 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: movl %ebp, %edx ; CHECK-NEXT: callq *232(%rax) ; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: jne LBB0_23 -; CHECK-NEXT: LBB0_19: ## %cond_next89 +; CHECK-NEXT: jne LBB0_27 +; CHECK-NEXT: LBB0_21: ## %cond_next89 ; CHECK-NEXT: movl $2, %edi ; CHECK-NEXT: callq _PyTuple_New ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je LBB0_23 -; CHECK-NEXT: ## %bb.20: ## %cond_next97 +; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: ## %bb.22: ## %cond_next97 ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: movq _PyArray_API@GOTPCREL(%rip), %r12 ; CHECK-NEXT: movq (%r12), %rax @@ -163,8 +160,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: callq *304(%rdi) ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je LBB0_21 -; CHECK-NEXT: ## %bb.25: ## %cond_next135 +; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: ## %bb.23: ## %cond_next135 ; CHECK-NEXT: movb %r15b, 16(%rax) ; CHECK-NEXT: movq %rax, 24(%r14) ; CHECK-NEXT: movq (%r12), %rax @@ -172,22 +169,22 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) { ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: callq *304(%rdi) ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: je LBB0_21 -; CHECK-NEXT: ## %bb.26: ## %cond_next182 +; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: ## %bb.24: ## %cond_next182 ; CHECK-NEXT: movb %bl, 16(%rax) ; CHECK-NEXT: movq %rax, 32(%r14) ; CHECK-NEXT: movq %r14, %rax -; CHECK-NEXT: jmp LBB0_24 -; CHECK-NEXT: LBB0_21: ## %cond_true113 +; CHECK-NEXT: jmp LBB0_28 +; CHECK-NEXT: LBB0_25: ## %cond_true113 ; CHECK-NEXT: decq (%r14) -; CHECK-NEXT: jne LBB0_23 -; CHECK-NEXT: ## %bb.22: ## %cond_true126 +; CHECK-NEXT: jne LBB0_27 +; CHECK-NEXT: ## %bb.26: ## %cond_true126 ; CHECK-NEXT: movq 8(%r14), %rax ; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: callq *48(%rax) -; CHECK-NEXT: LBB0_23: ## 
%UnifiedReturnBlock +; CHECK-NEXT: LBB0_27: ## %UnifiedReturnBlock ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: LBB0_24: ## %UnifiedReturnBlock +; CHECK-NEXT: LBB0_28: ## %UnifiedReturnBlock ; CHECK-NEXT: addq $32, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 0de308a..5152c005 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind { define void @avg_v64i8_2(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v64i8_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rsi), %xmm0 -; SSE2-NEXT: movaps 16(%rsi), %xmm1 -; SSE2-NEXT: movaps 32(%rsi), %xmm2 -; SSE2-NEXT: movaps 48(%rsi), %xmm3 -; SSE2-NEXT: movups %xmm3, (%rax) -; SSE2-NEXT: movups %xmm2, (%rax) -; SSE2-NEXT: movups %xmm1, (%rax) -; SSE2-NEXT: movups %xmm0, (%rax) +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb 48(%rsi), %xmm3 +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; 
AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) -; AVX2-NEXT: vmovups %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v64i8_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rsi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v64i8_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <64 x i8>, ptr %a %2 = load <64 x i8>, ptr %b %3 = zext <64 x i8> %1 to <64 x i32> %4 = zext <64 x i8> %2 to <64 x i32> - %5 = add nuw nsw <64 x i32> %4, %4 + %5 = add nuw nsw <64 x i32> %3, %4 %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %7 = lshr <64 x i32> %6, <i32 
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %8 = trunc <64 x i32> %7 to <64 x i8> @@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind { ret void } - define void @avg_v4i16_2(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v4i16_2: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll index a0c243b..f3950b7 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -1,16 +1,15 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -;; A minimal test case. llc will crash if global variables already has a section -;; prefix. Subsequent PRs will expand on this test case to test the hotness -;; reconciliation implementation. - -; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +;; A minimal test case. Subsequent PRs will expand on this test case +;; (e.g., with more functions, variables and profiles) and test the hotness +;; reconcillation implementation. 
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ ; RUN: -partition-static-data-sections=true \ ; RUN: -data-sections=true -unique-section-names=false \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR -; ERR: Global variable hot_bss already has a section prefix hot +; IR: .section .bss.hot.,"aw" @hot_bss = internal global i32 0, !section_prefix !17 diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll index ce06d17..604b4fd 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition.ll @@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu" ; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8 ; AGG-NEXT: .section .data.unlikely.,"aw",@progbits +;; The `.section` directive is omitted for .data with -unique-section-names=false. +; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; For @data_with_unknown_hotness ; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness ; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits ; UNIQ: .section .data,"aw",@progbits,unique,9 -; The `.section` directive is omitted for .data with -unique-section-names=false. -; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; AGG: .data ; COMMON: .Ldata_with_unknown_hotness: -; For @hot_data_custom_bar_section -; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix. 
+; For variables that are not eligible for section prefix annotation ; COMMON: .type hot_data_custom_bar_section,@object ; SYM-NEXT: .section bar,"aw",@progbits ; SYM: hot_data_custom_bar_section ; UNIQ: .section bar,"aw",@progbits ; AGG: .section bar,"aw",@progbits +; SYM: .section .data.llvm.fake_var,"aw" +; UNIQ: .section .data,"aw" +; AGG: .data + +;; No section for linker declaration +; COMMON-NOT: qux + @.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1 @.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1 @hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3] @@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu" @data3 = internal global i32 3 @data_with_unknown_hotness = private global i32 5 @hot_data_custom_bar_section = internal global i32 101 #0 +@llvm.fake_var = internal global i32 123 +@qux = external global i64 define void @cold_func(i32 %0) !prof !15 { %2 = load i32, ptr @cold_bss diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll index df04b67..c2b7068 100644 --- a/llvm/test/CodeGen/X86/isel-fpclass.ll +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86,X86-SDAGISEL ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL ; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL ; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL -; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=2 | FileCheck %s -check-prefixes=X64,X64-GISEL 
+; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86,X86-GISEL +; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X64-GISEL define i1 @isnone_f(float %x) nounwind { ; X86-LABEL: isnone_f: @@ -23,6 +23,11 @@ define i1 @isnone_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl +; +; X64-GISEL-LABEL: isnone_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %eax, %eax +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) ret i1 %0 @@ -45,22 +50,27 @@ define i1 @isany_f(float %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl +; +; X64-GISEL-LABEL: isany_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movb $1, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) ret i1 %0 } define i1 @issignaling_f(float %x) nounwind { -; X86-LABEL: issignaling_f: -; X86: # %bb.0: -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-NEXT: setl %cl -; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X86-NEXT: setge %al -; X86-NEXT: andb %cl, %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: issignaling_f: +; X86-SDAGISEL: # %bb.0: +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %cl +; X86-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: andb %cl, %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: issignaling_f: ; X64: # %bb.0: @@ -87,18 +97,44 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: andb %cl, %al ; X86-FASTISEL-NEXT: popl %ecx ; 
X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: issignaling_f: +; X86-GISEL: # %bb.0: +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %dl +; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: andb %dl, %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: issignaling_f: +; X64-GISEL: # %bb.0: +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %dl +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: andb %dl, %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %a0 } define i1 @isquiet_f(float %x) nounwind { -; X86-LABEL: isquiet_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-NEXT: setge %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: isquiet_f: ; X64: # %bb.0: # %entry @@ -119,19 +155,39 @@ define i1 @issignaling_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isquiet_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 
0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setae %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isquiet_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setae %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 } define i1 @not_isquiet_f(float %x) nounwind { -; X86-LABEL: not_isquiet_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X86-NEXT: setl %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: not_isquiet_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: not_isquiet_f: ; X64: # %bb.0: # %entry @@ -152,19 +208,57 @@ define i1 @not_isquiet_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: not_isquiet_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %cl +; X86-GISEL-NEXT: orb %dl, %cl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %dl +; X86-GISEL-NEXT: cmpl 
$2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: andb %dl, %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isquiet_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %cl +; X64-GISEL-NEXT: orb %dl, %cl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %dl +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: andb %dl, %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 } define i1 @isinf_f(float %x) nounwind { -; X86-LABEL: isinf_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: isinf_f: ; X64: # %bb.0: # %entry @@ -185,19 +279,39 @@ define i1 @isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isinf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 
0x7F800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isinf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 } define i1 @not_isinf_f(float %x) nounwind { -; X86-LABEL: not_isinf_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: not_isinf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: not_isinf_f: ; X64: # %bb.0: # %entry @@ -218,17 +332,43 @@ define i1 @not_isinf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: not_isinf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %dl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isinf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl 
%ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 } define i1 @is_plus_inf_f(float %x) nounwind { -; X86-LABEL: is_plus_inf_f: -; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: is_plus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: is_plus_inf_f: ; X64: # %bb.0: # %entry @@ -246,17 +386,34 @@ define i1 @is_plus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: is_plus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_plus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 } define i1 @is_minus_inf_f(float %x) nounwind { -; X86-LABEL: is_minus_inf_f: -; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-NEXT: sete %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; 
X86-SDAGISEL-NEXT: sete %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -274,17 +431,34 @@ define i1 @is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: sete %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: is_minus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_minus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 } define i1 @not_is_minus_inf_f(float %x) nounwind { -; X86-LABEL: not_is_minus_inf_f: -; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; X86-NEXT: setne %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: not_is_minus_inf_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-SDAGISEL-NEXT: setne %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: not_is_minus_inf_f: ; X64: # %bb.0: # %entry @@ -302,19 +476,55 @@ define i1 @not_is_minus_inf_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setne %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: not_is_minus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: pushl %ebx +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: movl %eax, %ecx +; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %edx, %edx +; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %bl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: 
sete %ah +; X86-GISEL-NEXT: orb %dl, %ah +; X86-GISEL-NEXT: orb %bl, %ah +; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %ah, %al +; X86-GISEL-NEXT: popl %ebx +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_is_minus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: movl %eax, %ecx +; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %edx, %edx +; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %sil +; X64-GISEL-NEXT: orb %dl, %sil +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %dl +; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: orb %sil, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 } define i1 @isfinite_f(float %x) nounwind { -; X86-LABEL: isfinite_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: setl %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setl %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: isfinite_f: ; X64: # %bb.0: # %entry @@ -335,19 +545,39 @@ define i1 @isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setl %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isfinite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl 
$2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isfinite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } define i1 @not_isfinite_f(float %x) nounwind { -; X86-LABEL: not_isfinite_f: -; X86: # %bb.0: # %entry -; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X86-NEXT: setge %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: not_isfinite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-SDAGISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setge %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: not_isfinite_f: ; X64: # %bb.0: # %entry @@ -368,17 +598,43 @@ define i1 @not_isfinite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setge %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: not_isfinite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %dl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isfinite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl 
$2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 } define i1 @is_plus_finite_f(float %x) nounwind { -; X86-LABEL: is_plus_finite_f: -; X86: # %bb.0: # %entry -; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; X86-NEXT: setb %al -; X86-NEXT: retl +; X86-SDAGISEL-LABEL: is_plus_finite_f: +; X86-SDAGISEL: # %bb.0: # %entry +; X86-SDAGISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-SDAGISEL-NEXT: setb %al +; X86-SDAGISEL-NEXT: retl ; ; X64-LABEL: is_plus_finite_f: ; X64: # %bb.0: # %entry @@ -396,6 +652,23 @@ define i1 @is_plus_finite_f(float %x) nounwind { ; X86-FASTISEL-NEXT: setb %al ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: is_plus_finite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_plus_finite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" ret i1 %0 @@ -418,6 +691,11 @@ define i1 @isnone_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl +; +; X64-GISEL-LABEL: isnone_d: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %eax, %eax +; 
X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) ret i1 %0 @@ -440,6 +718,11 @@ define i1 @isany_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl +; +; X64-GISEL-LABEL: isany_d: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movb $1, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) ret i1 %0 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 4cde581..caec02e 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4765,6 +4765,66 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32 } declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32 immarg, <8 x i1>) +define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { +; X64-LABEL: pr163023_sext: +; X64: # %bb.0: +; X64-NEXT: kxnorw %k0, %k0, %k1 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpgatherdd (%rdi,%zmm0), %zmm1 {%k1} +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: pr163023_sext: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kxnorw %k0, %k0, %k1 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-NEXT: retl + %addr.p = ptrtoint ptr %a0 to i64 + %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0 + %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer + %ofs = sext <16 x i32> %a1 to <16 x i64> + %addr = add nuw <16 x i64> %addr.splat, %ofs + %ptr = inttoptr <16 x i64> %addr to <16 x ptr> + %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr, i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) + ret <16 x i32> %gather +} + +define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { +; 
X64-LABEL: pr163023_zext: +; X64: # %bb.0: +; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-NEXT: kxnorw %k0, %k0, %k1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-NEXT: kxnorw %k0, %k0, %k2 +; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: pr163023_zext: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kxnorw %k0, %k0, %k1 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-NEXT: retl + %addr.p = ptrtoint ptr %a0 to i64 + %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0 + %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer + %ofs = zext <16 x i32> %a1 to <16 x i64> + %addr = add nuw <16 x i64> %addr.splat, %ofs + %ptr = inttoptr <16 x i64> %addr to <16 x ptr> + %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr, i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) + ret <16 x i32> %gather +} + ; ; PR45906 ; This used to cause fast-isel to generate bad copy instructions that would diff --git a/llvm/test/CodeGen/X86/pr160612.ll b/llvm/test/CodeGen/X86/pr160612.ll new file mode 100644 index 0000000..6572c42 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr160612.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -O2 | FileCheck %s + +; Test for issue #160612: OR conditions in branches should use multiple branches +; instead of materializing 
booleans with SETCC when no special optimizations apply. + +declare void @subroutine_foo() +declare void @subroutine_bar() + +; Original issue: (x == 0 || y == 0) was generating SETCC + TEST + BRANCH +; instead of using two conditional branches directly. +define void @func_a(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: func_a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: jne subroutine_bar@PLT # TAILCALL +; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: jmp subroutine_foo@PLT # TAILCALL +entry: + %cmp = icmp eq i32 %x, 0 + %cmp1 = icmp eq i32 %y, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.else + +if.then: + tail call void @subroutine_foo() + br label %if.end + +if.else: + tail call void @subroutine_bar() + br label %if.end + +if.end: + ret void +} + +; Reference implementation that already generated optimal code. +; This should continue to generate the same optimal code. 
+define void @func_b(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: func_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL +; CHECK-NEXT: # %bb.1: # %if.else +; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: je subroutine_foo@PLT # TAILCALL +; CHECK-NEXT: # %bb.2: # %if.else3 +; CHECK-NEXT: jmp subroutine_bar@PLT # TAILCALL +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: + tail call void @subroutine_foo() + br label %if.end4 + +if.else: + %cmp1 = icmp eq i32 %y, 0 + br i1 %cmp1, label %if.then2, label %if.else3 + +if.then2: + tail call void @subroutine_foo() + br label %if.end4 + +if.else3: + tail call void @subroutine_bar() + br label %if.end4 + +if.end4: + ret void +} diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll index ea22b08..954ea8f 100644 --- a/llvm/test/CodeGen/X86/relptr-rodata.ll +++ b/llvm/test/CodeGen/X86/relptr-rodata.ll @@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: .long hidden-rodata @rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32) +; CHECK: .section .rodata.rodata_ptrtoaddr +; CHECK: rodata_ptrtoaddr: +; CHECK: .long hidden-rodata_ptrtoaddr +@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32) + ; CHECK: .section .data.rel.ro.relro1 ; CHECK: relro1: ; CHECK: .long default-relro1 @relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32) +; CHECK: .section .data.rel.ro.relro1_ptrtoaddr +; CHECK: relro1_ptrtoaddr: +; CHECK: .long default-relro1_ptrtoaddr +@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32) + ; CHECK: .section .data.rel.ro.relro2 ; CHECK: 
relro2: ; CHECK: .long hidden-relro2 @relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32) +; CHECK: .section .data.rel.ro.relro2_ptrtoaddr +; CHECK: relro2_ptrtoaddr: +; CHECK: .long hidden-relro2_ptrtoaddr +@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32) + ; CHECK: .section .rodata.obj ; CHECK-NEXT: .globl obj ; CHECK: obj: diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 5aa266d..d018c53 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -1447,3 +1447,175 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { %r = icmp eq i512 %a, %b ret i1 %r } + +; Tests for any/allbits from memory. + +define i1 @anybits_i128_load_arg(ptr %w) { +; ANY-LABEL: anybits_i128_load_arg: +; ANY: # %bb.0: +; ANY-NEXT: movq (%rdi), %rax +; ANY-NEXT: orq 8(%rdi), %rax +; ANY-NEXT: setne %al +; ANY-NEXT: retq + %ld = load i128, ptr %w + %cmp = icmp ne i128 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i128_load_arg(ptr %w) { +; SSE2-LABEL: allbits_i128_load_arg: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb (%rdi), %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allbits_i128_load_arg: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: setb %al +; SSE41-NEXT: retq +; +; AVXANY-LABEL: allbits_i128_load_arg: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVXANY-NEXT: vptest %xmm1, %xmm0 +; AVXANY-NEXT: setb %al +; AVXANY-NEXT: retq + %ld = load i128, ptr %w + %cmp = icmp eq i128 %ld, -1 + ret i1 %cmp +} + +define i1 @anybits_i256_load_arg(ptr %w) { +; SSE-LABEL: 
anybits_i256_load_arg: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: orq 24(%rdi), %rcx +; SSE-NEXT: orq 16(%rdi), %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: setne %al +; SSE-NEXT: retq +; +; AVXANY-LABEL: anybits_i256_load_arg: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXANY-NEXT: vptest %ymm0, %ymm0 +; AVXANY-NEXT: setne %al +; AVXANY-NEXT: vzeroupper +; AVXANY-NEXT: retq + %ld = load i256, ptr %w + %cmp = icmp ne i256 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i256_load_arg(ptr %w) { +; SSE-LABEL: allbits_i256_load_arg: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: andq 24(%rdi), %rcx +; SSE-NEXT: andq 16(%rdi), %rax +; SSE-NEXT: andq %rcx, %rax +; SSE-NEXT: cmpq $-1, %rax +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1-LABEL: allbits_i256_load_arg: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vptest %ymm1, %ymm0 +; AVX1-NEXT: setb %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: allbits_i256_load_arg: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: setb %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: allbits_i256_load_arg: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: setb %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i256, ptr %w + %cmp = icmp eq i256 %ld, -1 + ret i1 %cmp +} + +define i1 @anybits_i512_load_arg(ptr %w) { +; NO512-LABEL: anybits_i512_load_arg: +; NO512: # %bb.0: +; NO512-NEXT: movq 16(%rdi), %rax +; NO512-NEXT: movq (%rdi), %rcx +; NO512-NEXT: movq 8(%rdi), %rdx +; NO512-NEXT: movq 24(%rdi), %rsi +; NO512-NEXT: orq 56(%rdi), %rsi +; NO512-NEXT: orq 40(%rdi), 
%rdx +; NO512-NEXT: orq %rsi, %rdx +; NO512-NEXT: orq 48(%rdi), %rax +; NO512-NEXT: orq 32(%rdi), %rcx +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: setne %al +; NO512-NEXT: retq +; +; AVX512-LABEL: anybits_i512_load_arg: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i512, ptr %w + %cmp = icmp ne i512 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i512_load_arg(ptr %w) { +; NO512-LABEL: allbits_i512_load_arg: +; NO512: # %bb.0: +; NO512-NEXT: movq 16(%rdi), %rax +; NO512-NEXT: movq (%rdi), %rcx +; NO512-NEXT: movq 8(%rdi), %rdx +; NO512-NEXT: movq 24(%rdi), %rsi +; NO512-NEXT: andq 56(%rdi), %rsi +; NO512-NEXT: andq 40(%rdi), %rdx +; NO512-NEXT: andq %rsi, %rdx +; NO512-NEXT: andq 48(%rdi), %rax +; NO512-NEXT: andq 32(%rdi), %rcx +; NO512-NEXT: andq %rax, %rcx +; NO512-NEXT: andq %rdx, %rcx +; NO512-NEXT: cmpq $-1, %rcx +; NO512-NEXT: sete %al +; NO512-NEXT: retq +; +; AVX512-LABEL: allbits_i512_load_arg: +; AVX512: # %bb.0: +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1 +; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i512, ptr %w + %cmp = icmp eq i512 %ld, -1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll index b2064b1..02d4d88 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrap-unwind.ll @@ -181,40 +181,38 @@ define zeroext i1 @segmentedStack(ptr readonly %vk1, ptr readonly %vk2, i64 %key ; CHECK-LABEL: segmentedStack: ; CHECK: ## %bb.0: ; CHECK-NEXT: cmpq %gs:816, %rsp -; CHECK-NEXT: jbe LBB3_6 +; CHECK-NEXT: jbe LBB3_7 ; CHECK-NEXT: LBB3_1: ## %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; 
CHECK-NEXT: testq %rdi, %rdi -; CHECK-NEXT: sete %al -; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: sete %cl -; CHECK-NEXT: orb %al, %cl ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: sete %al -; CHECK-NEXT: testb %cl, %cl -; CHECK-NEXT: jne LBB3_4 -; CHECK-NEXT: ## %bb.2: ## %if.end4.i +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: je LBB3_5 +; CHECK-NEXT: ## %bb.2: ## %entry +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: je LBB3_5 +; CHECK-NEXT: ## %bb.3: ## %if.end4.i ; CHECK-NEXT: movq 8(%rdi), %rdx ; CHECK-NEXT: cmpq 8(%rsi), %rdx -; CHECK-NEXT: jne LBB3_5 -; CHECK-NEXT: ## %bb.3: ## %land.rhs.i.i +; CHECK-NEXT: jne LBB3_6 +; CHECK-NEXT: ## %bb.4: ## %land.rhs.i.i ; CHECK-NEXT: movq (%rsi), %rsi ; CHECK-NEXT: movq (%rdi), %rdi ; CHECK-NEXT: callq _memcmp ; CHECK-NEXT: testl %eax, %eax ; CHECK-NEXT: sete %al -; CHECK-NEXT: LBB3_4: ## %__go_ptr_strings_equal.exit +; CHECK-NEXT: LBB3_5: ## %__go_ptr_strings_equal.exit ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq -; CHECK-NEXT: LBB3_5: +; CHECK-NEXT: LBB3_6: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq -; CHECK-NEXT: LBB3_6: +; CHECK-NEXT: LBB3_7: ; CHECK-NEXT: movl $8, %r10d ; CHECK-NEXT: movl $0, %r11d ; CHECK-NEXT: callq ___morestack @@ -224,43 +222,41 @@ define zeroext i1 @segmentedStack(ptr readonly %vk1, ptr readonly %vk2, i64 %key ; NOCOMPACTUNWIND-LABEL: segmentedStack: ; NOCOMPACTUNWIND: # %bb.0: ; NOCOMPACTUNWIND-NEXT: cmpq %fs:112, %rsp -; NOCOMPACTUNWIND-NEXT: jbe .LBB3_6 +; NOCOMPACTUNWIND-NEXT: jbe .LBB3_7 ; NOCOMPACTUNWIND-NEXT: .LBB3_1: # %entry ; NOCOMPACTUNWIND-NEXT: pushq %rax ; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 16 -; NOCOMPACTUNWIND-NEXT: testq %rdi, %rdi -; NOCOMPACTUNWIND-NEXT: sete %al -; NOCOMPACTUNWIND-NEXT: testq %rsi, %rsi -; NOCOMPACTUNWIND-NEXT: sete %cl -; NOCOMPACTUNWIND-NEXT: orb %al, %cl ; NOCOMPACTUNWIND-NEXT: movq 
%rdi, %rax ; NOCOMPACTUNWIND-NEXT: orq %rsi, %rax ; NOCOMPACTUNWIND-NEXT: sete %al -; NOCOMPACTUNWIND-NEXT: testb %cl, %cl -; NOCOMPACTUNWIND-NEXT: jne .LBB3_4 -; NOCOMPACTUNWIND-NEXT: # %bb.2: # %if.end4.i +; NOCOMPACTUNWIND-NEXT: testq %rdi, %rdi +; NOCOMPACTUNWIND-NEXT: je .LBB3_5 +; NOCOMPACTUNWIND-NEXT: # %bb.2: # %entry +; NOCOMPACTUNWIND-NEXT: testq %rsi, %rsi +; NOCOMPACTUNWIND-NEXT: je .LBB3_5 +; NOCOMPACTUNWIND-NEXT: # %bb.3: # %if.end4.i ; NOCOMPACTUNWIND-NEXT: movq 8(%rdi), %rdx ; NOCOMPACTUNWIND-NEXT: cmpq 8(%rsi), %rdx -; NOCOMPACTUNWIND-NEXT: jne .LBB3_5 -; NOCOMPACTUNWIND-NEXT: # %bb.3: # %land.rhs.i.i +; NOCOMPACTUNWIND-NEXT: jne .LBB3_6 +; NOCOMPACTUNWIND-NEXT: # %bb.4: # %land.rhs.i.i ; NOCOMPACTUNWIND-NEXT: movq (%rsi), %rsi ; NOCOMPACTUNWIND-NEXT: movq (%rdi), %rdi ; NOCOMPACTUNWIND-NEXT: callq memcmp@PLT ; NOCOMPACTUNWIND-NEXT: testl %eax, %eax ; NOCOMPACTUNWIND-NEXT: sete %al -; NOCOMPACTUNWIND-NEXT: .LBB3_4: # %__go_ptr_strings_equal.exit +; NOCOMPACTUNWIND-NEXT: .LBB3_5: # %__go_ptr_strings_equal.exit ; NOCOMPACTUNWIND-NEXT: # kill: def $al killed $al killed $eax ; NOCOMPACTUNWIND-NEXT: popq %rcx ; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 8 ; NOCOMPACTUNWIND-NEXT: retq -; NOCOMPACTUNWIND-NEXT: .LBB3_5: +; NOCOMPACTUNWIND-NEXT: .LBB3_6: ; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 16 ; NOCOMPACTUNWIND-NEXT: xorl %eax, %eax ; NOCOMPACTUNWIND-NEXT: # kill: def $al killed $al killed $eax ; NOCOMPACTUNWIND-NEXT: popq %rcx ; NOCOMPACTUNWIND-NEXT: .cfi_def_cfa_offset 8 ; NOCOMPACTUNWIND-NEXT: retq -; NOCOMPACTUNWIND-NEXT: .LBB3_6: +; NOCOMPACTUNWIND-NEXT: .LBB3_7: ; NOCOMPACTUNWIND-NEXT: movl $8, %r10d ; NOCOMPACTUNWIND-NEXT: movl $0, %r11d ; NOCOMPACTUNWIND-NEXT: callq __morestack |