diff options
Diffstat (limited to 'llvm/test/CodeGen')
88 files changed, 6414 insertions, 4173 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir new file mode 100644 index 0000000..b9cde95 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.0: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:11111101 SignBits:6 + %0:_(s8) = G_CONSTANT i8 19 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: CstZero +body: | + bb.0: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 255 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarVar +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarZero +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarZero + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SMULH %0, %1 +... +--- +name: ScalarVarAbs +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVarAbs + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? 
SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_ABS %0 + %2:_(s16) = G_SEXT %1 + %3:_(s16) = G_CONSTANT i16 1 + %4:_(s16) = G_SMULH %2, %3 +... +--- +name: SplatVecCst +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecCst + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %3:_(<vscale x 16 x s8>) = G_SMULH %1, %2 +... +--- +name: SplatVecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8) + %6:_(<vscale x 16 x s8>) = G_SMULH %1, %5 +... +--- +name: VecCst +body: | + bb.0: + ; CHECK-LABEL: name: @VecCst + ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 25 + %1:_(s8) = G_CONSTANT i8 225 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %4:_(<2 x s8>) = G_SMULH %2, %3 +... 
+--- +name: VecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @VecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8) + %6:_(<2 x s8>) = G_SMULH %1, %5 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir new file mode 100644 index 0000000..debdbaa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.0: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00010000 SignBits:3 + %0:_(s8) = G_CONSTANT i8 19 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: CstZero +body: | + bb.0: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 255 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarVar +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? 
SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarZero +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarZero + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_UMULH %0, %1 +... +--- +name: ScalarVarAbs +body: | + bb.0: + ; CHECK-LABEL: name: @ScalarVarAbs + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_ABS %0 + %2:_(s16) = G_SEXT %1 + %3:_(s16) = G_CONSTANT i16 1 + %4:_(s16) = G_UMULH %2, %3 +... +--- +name: SplatVecCst +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecCst + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %3:_ KnownBits:11110100 SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %3:_(<vscale x 16 x s8>) = G_UMULH %1, %2 +... +--- +name: SplatVecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @SplatVecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:0000???? 
SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8) + %6:_(<vscale x 16 x s8>) = G_UMULH %1, %5 +... +--- +name: VecCst +body: | + bb.0: + ; CHECK-LABEL: name: @VecCst + ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1 + %0:_(s8) = G_CONSTANT i8 25 + %1:_(s8) = G_CONSTANT i8 225 + %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8) + %4:_(<2 x s8>) = G_UMULH %2, %3 +... +--- +name: VecPartScalar +body: | + bb.0: + ; CHECK-LABEL: name: @VecPartScalar + ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %6:_ KnownBits:0000???? SignBits:4 + %0:_(s8) = G_CONSTANT i8 250 + %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8) + %2:_(s8) = G_IMPLICIT_DEF + %3:_(s8) = G_CONSTANT i8 15 + %4:_(s8) = G_AND %2, %3 + %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8) + %6:_(<2 x s8>) = G_UMULH %1, %5 +... 
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll index 322a96a..e8e5631 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -739,14 +739,12 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) { ; ; GISEL-LABEL: postidx32_shalf: ; GISEL: ; %bb.0: -; GISEL-NEXT: movi d1, #0000000000000000 -; GISEL-NEXT: ldr h2, [x0], #4 +; GISEL-NEXT: ldr h1, [x0], #4 ; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0 ; GISEL-NEXT: fmov w9, s0 -; GISEL-NEXT: fcvt s3, h2 -; GISEL-NEXT: fmov w8, s2 -; GISEL-NEXT: fcvt s1, h1 -; GISEL-NEXT: fcmp s3, s1 +; GISEL-NEXT: fcvt s2, h1 +; GISEL-NEXT: fmov w8, s1 +; GISEL-NEXT: fcmp s2, #0.0 ; GISEL-NEXT: csel w8, w8, w9, mi ; GISEL-NEXT: strh w8, [x1] ; GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll index b234ef7..085170c 100644 --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -782,18 +782,16 @@ define void @test_fccmp(half %in, ptr %out) { ; ; CHECK-CVT-GI-LABEL: test_fccmp: ; CHECK-CVT-GI: // %bb.0: -; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0 ; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-GI-NEXT: fcvt s2, h0 -; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] -; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_1 -; CHECK-CVT-GI-NEXT: ldr h4, [x8, :lo12:.LCPI29_1] +; CHECK-CVT-GI-NEXT: fcvt s1, h0 +; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000 +; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0 +; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000 +; CHECK-CVT-GI-NEXT: fcmp s1, s2 +; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0] ; CHECK-CVT-GI-NEXT: fmov w8, s0 -; CHECK-CVT-GI-NEXT: fcvt s3, h1 -; CHECK-CVT-GI-NEXT: fmov w9, s1 -; CHECK-CVT-GI-NEXT: fcvt s4, h4 -; CHECK-CVT-GI-NEXT: fcmp s2, s3 -; CHECK-CVT-GI-NEXT: fccmp s2, s4, #4, mi +; CHECK-CVT-GI-NEXT: fmov w9, s2 +; 
CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi ; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt ; CHECK-CVT-GI-NEXT: strh w8, [x0] ; CHECK-CVT-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll index 7409bfb..743d160 100644 --- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll +++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll @@ -149,33 +149,21 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) { } define i32 @fcvtzs_f16_i32_7(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_f16_i32_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI8_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI8_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0 @@ -189,33 +177,21 @@ define i32 @fcvtzs_f16_i32_7(half %flt) { } define i32 @fcvtzs_f16_i32_15(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: 
fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_f16_i32_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI9_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI9_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0 @@ -229,33 +205,21 @@ define i32 @fcvtzs_f16_i32_15(half %flt) { } define i64 @fcvtzs_f16_i64_7(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_f16_i64_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7 ; 
CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI10_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI10_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0 @@ -269,33 +233,21 @@ define i64 @fcvtzs_f16_i64_7(half %flt) { } define i64 @fcvtzs_f16_i64_15(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_f16_i64_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI11_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI11_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0 @@ -453,33 +405,21 @@ define i64 
@fcvtzu_f64_i64_64(double %dbl) { } define i32 @fcvtzu_f16_i32_7(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_f16_i32_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI20_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI20_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0 @@ -493,33 +433,21 @@ define i32 @fcvtzu_f16_i32_7(half %flt) { } define i32 @fcvtzu_f16_i32_15(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_f16_i32_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt 
h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI21_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI21_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0 @@ -533,33 +461,21 @@ define i32 @fcvtzu_f16_i32_15(half %flt) { } define i64 @fcvtzu_f16_i64_7(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_f16_i64_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI22_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI22_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: 
fcvtzu x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0 @@ -573,33 +489,21 @@ define i64 @fcvtzu_f16_i64_7(half %flt) { } define i64 @fcvtzu_f16_i64_15(half %flt) { -; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_f16_i64_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI23_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI23_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0 @@ -774,13 +678,11 @@ define half @scvtf_f16_i32_7(i32 %int) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s0, w0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI32_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI32_0] +; CHECK-GI-NO16-NEXT: scvtf s1, w0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, 
s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -814,13 +716,11 @@ define half @scvtf_f16_i32_15(i32 %int) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s0, w0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI33_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI33_0] +; CHECK-GI-NO16-NEXT: scvtf s1, w0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -854,13 +754,11 @@ define half @scvtf_f16_i64_7(i64 %long) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s0, x0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI34_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI34_0] +; CHECK-GI-NO16-NEXT: scvtf s1, x0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -894,13 +792,11 @@ define half @scvtf_f16_i64_15(i64 %long) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s0, x0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI35_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI35_0] +; CHECK-GI-NO16-NEXT: scvtf s1, x0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 
; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1078,13 +974,11 @@ define half @ucvtf_f16_i32_7(i32 %int) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s0, w0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI44_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI44_0] +; CHECK-GI-NO16-NEXT: ucvtf s1, w0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1118,13 +1012,11 @@ define half @ucvtf_f16_i32_15(i32 %int) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s0, w0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI45_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI45_0] +; CHECK-GI-NO16-NEXT: ucvtf s1, w0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1158,13 +1050,11 @@ define half @ucvtf_f16_i64_7(i64 %long) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s0, x0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI46_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI46_0] +; CHECK-GI-NO16-NEXT: ucvtf s1, x0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1198,13 +1088,11 @@ define half 
@ucvtf_f16_i64_15(i64 %long) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s0, x0 -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI47_0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI47_0] +; CHECK-GI-NO16-NEXT: ucvtf s1, x0 +; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 +; CHECK-GI-NO16-NEXT: fcvt h1, s1 ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 +; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1356,33 +1244,21 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) { } define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI55_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI55_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: 
adrp x8, .LCPI55_0 @@ -1396,33 +1272,21 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { } define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI56_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI56_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0 @@ -1436,33 +1300,21 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { } define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, 
#67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI57_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI57_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0 @@ -1476,33 +1328,21 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { } define i64 @fcvtzs_sat_f16_i64_15(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzs x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI58_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI58_0] -; CHECK-GI-NO16-NEXT: 
fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0 @@ -1650,33 +1490,21 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) { } define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI66_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI66_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0 @@ -1690,33 +1518,21 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { } define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, 
h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu w0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI67_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI67_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0 @@ -1730,33 +1546,21 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { } define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; 
CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI68_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI68_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0 @@ -1770,33 +1574,21 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { } define i64 @fcvtzu_sat_f16_i64_15(half %dbl) { -; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15: -; CHECK-SD-NO16: // %bb.0: -; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 -; CHECK-SD-NO16-NEXT: fcvt h0, s0 -; CHECK-SD-NO16-NEXT: fcvt s0, h0 -; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 -; CHECK-SD-NO16-NEXT: ret +; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15: +; CHECK-NO16: // %bb.0: +; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fmul s0, s0, s1 +; CHECK-NO16-NEXT: fcvt h0, s0 +; CHECK-NO16-NEXT: fcvt s0, h0 +; CHECK-NO16-NEXT: fcvtzu x0, s0 +; CHECK-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; -; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15: -; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: adrp x8, .LCPI69_0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI69_0] -; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 -; CHECK-GI-NO16-NEXT: fcvt h0, s0 -; CHECK-GI-NO16-NEXT: fcvt s0, h0 -; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 -; CHECK-GI-NO16-NEXT: ret -; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; 
CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0 @@ -1811,4 +1603,3 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) { ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} ; CHECK-FP16: {{.*}} -; CHECK-NO16: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll index 98276b6..e1bc742 100644 --- a/llvm/test/CodeGen/AArch64/frem-power2.ll +++ b/llvm/test/CodeGen/AArch64/frem-power2.ll @@ -100,9 +100,8 @@ define half @hrem2_nsz(half %x) { ; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w30, -16 -; CHECK-GI-NEXT: fmov h1, #2.00000000 ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: fmov s1, #2.00000000 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll index 467ceb0..a373004 100644 --- a/llvm/test/CodeGen/AArch64/pr58431.ll +++ b/llvm/test/CodeGen/AArch64/pr58431.ll @@ -9,7 +9,7 @@ define i32 @f(i64 %0) { ; CHECK-NEXT: mov w10, #10 // =0xa ; CHECK-NEXT: eor x8, x8, #0x8000000000000003 ; CHECK-NEXT: umulh x8, x9, x8 -; CHECK-NEXT: msub x0, x8, x10, x9 +; CHECK-NEXT: umsubl x0, w8, w10, x9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %2 = trunc i64 %0 to i32 diff --git a/llvm/test/CodeGen/AArch64/sve-asrd.ll b/llvm/test/CodeGen/AArch64/sve-asrd.ll new file mode 100644 index 0000000..66db1a5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-asrd.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mattr=+sve -combiner-disabled < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; Ensure we don't try to represent sdiv-by-one using ARSD. 
+define <16 x i16> @sdiv_by_one_v16i16(<16 x i16> %a) vscale_range(2,2) { +; CHECK-LABEL: sdiv_by_one_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8] +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sunpklo z3.s, z2.h +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #16 +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z3.s +; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: movprfx z1, z0 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: ret + %res = sdiv <16 x i16> %a, splat(i16 1) + ret <16 x i16> %res +} + +; Ensure we don't try to represent sdiv-by-one using ARSD. 
+define <vscale x 8 x i16> @sdiv_by_one_nxv8i16(<vscale x 8 x i16> %a) { +; CHECK-LABEL: sdiv_by_one_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.h, #1 // =0x1 +; CHECK-NEXT: sunpkhi z2.s, z0.h +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sunpkhi z3.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = sdiv <vscale x 8 x i16> %a, splat(i16 1) + ret <vscale x 8 x i16> %res +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index f4ae66a..4ad5b38 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -70,6 +70,12 @@ define half @t3(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t3: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0 +; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t3: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 @@ -147,6 +153,12 @@ define half @t6(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t6: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0 +; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t6: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll index 139ecaf..67197b3fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll @@ -231,6 +231,274 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} 
@vector_deinterleave_nxv2i64_nxv ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval } +define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv6f16(<vscale x 6 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv6f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave3.nxv6f16(<vscale x 6 x half> %vec) + ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %retval +} + +define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv12f16(<vscale x 12 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv12f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave3.nxv12f16(<vscale x 12 x half> %vec) + ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %retval +} + +define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv24f16(<vscale x 24 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv24f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave3.nxv24f16(<vscale x 24 x half> %vec) + ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %retval +} + +define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv6f32(<vscale x 6 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv6f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave3.nxv6f32(<vscale x 6 x float> %vec) + ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %retval +} + +define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv12f32(<vscale x 12 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv12f32: +; CHECK: 
// %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave3.nxv12f32(<vscale x 12 x float> %vec) + ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %retval +} + +define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv6f64(<vscale x 6 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv6f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave3.nxv6f64(<vscale x 6 x double> %vec) + ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %retval +} + +define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv6bf16(<vscale x 6 x bfloat> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv6bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave3.nxv6bf16(<vscale x 6 x bfloat> %vec) + ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval +} + +define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} 
@vector_deinterleave_nxv4bf16_nxv12bf16(<vscale x 12 x bfloat> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv12bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z1, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave3.nxv12bf16(<vscale x 12 x bfloat> %vec) + ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval +} + +define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv24bf16(<vscale x 24 x bfloat> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv24bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave3.nxv24bf16(<vscale x 24 x bfloat> %vec) + ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval +} + +; Integers + +define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3b { z0.b - z2.b }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec) + ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval +} + +define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec) + ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval +} + +define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv12i32(<vscale x 12 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv12i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec) + ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval +} + +define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: str z2, [sp, #2, mul vl] +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp] +; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec) + ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval +} + define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) { ; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8: ; SVE: // %bb.0: @@ -599,31 +867,3 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv %retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec) ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval } - -; Floating declarations -declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>) -declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>) -declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>) -declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>) -declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>) -declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>) - -; Integer declarations -declare {<vscale x 16 x i8>, <vscale x 16 x i8>} 
@llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>) -declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>) -declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>) -declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>) - -; Predicated declarations -declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>) -declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>) -declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>) -declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>) - -; Illegal size type -declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>) -declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>) - -declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>) -declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>) -declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll index c7fb2db..49f185c 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ -221,6 +221,318 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale ret <vscale x 4 x i64> %retval } +define <vscale x 6 x half> @interleave3_nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2) { +; CHECK-LABEL: interleave3_nxv6f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2) + ret <vscale x 6 x half> %retval +} + +define <vscale x 12 x half> @interleave3_nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2) { +; CHECK-LABEL: interleave3_nxv12f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addpl x8, sp, #4 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z1, [sp] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl] +; CHECK-NEXT: str z0, [sp, #3, mul vl] +; CHECK-NEXT: ldr z1, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp, #3, mul vl] +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2) + ret <vscale x 12 x half> %retval +} + +define <vscale x 24 x half> @interleave3_nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2) { +; CHECK-LABEL: interleave3_nxv24f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2) + ret <vscale x 24 x half> %retval +} + +define <vscale x 6 x float> @interleave3_nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2) { +; CHECK-LABEL: interleave3_nxv6f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: addpl x8, sp, #4 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z1, [sp] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: st1w { z2.d }, p0, [x8, #7, mul vl] +; CHECK-NEXT: str z0, [sp, #3, mul vl] +; CHECK-NEXT: ldr z1, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp, #3, mul vl] +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2) + ret <vscale x 6 x float> %retval +} + +define <vscale x 12 x float> @interleave3_nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2) { +; CHECK-LABEL: interleave3_nxv12f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2) + ret <vscale x 12 x float> %retval +} + +define <vscale x 6 x double> @interleave3_nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2) { +; CHECK-LABEL: interleave3_nxv6f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 6 x double>@llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2) + ret <vscale x 6 x double> %retval +} + +define <vscale x 6 x bfloat> @interleave3_nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2) { +; CHECK-LABEL: interleave3_nxv6bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp, #2, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2) + ret <vscale x 6 x bfloat> %retval +} + +define <vscale x 12 x bfloat> @interleave3_nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2) { +; CHECK-LABEL: interleave3_nxv12bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addpl x8, sp, #4 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp, #1, mul vl] +; CHECK-NEXT: ldr z1, [sp] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl] +; CHECK-NEXT: str z0, [sp, #3, mul vl] +; CHECK-NEXT: ldr z1, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp, #3, mul vl] +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2) + ret <vscale x 12 x bfloat> %retval +} + +define <vscale x 24 x bfloat> @interleave3_nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2) { +; CHECK-LABEL: interleave3_nxv24bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2) + ret <vscale x 24 x bfloat> %retval +} + +; Integers + +define <vscale x 48 x i8> @interleave3_nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2) { +; CHECK-LABEL: interleave3_nxv48i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3b { z0.b - z2.b }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 48 x i8> @llvm.vector.interleave3.nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2) + ret <vscale x 48 x i8> %retval +} + +define <vscale x 24 x i16> @interleave3_nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) { +; CHECK-LABEL: interleave3_nxv24i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 24 x i16> @llvm.vector.interleave3.nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) + ret <vscale x 24 x i16> %retval +} + +define <vscale x 12 x i32> @interleave3_nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) { +; CHECK-LABEL: interleave3_nxv12i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) + ret <vscale x 12 x i32> %retval +} + +define <vscale x 6 x i64> @interleave3_nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2) { +; CHECK-LABEL: interleave3_nxv6i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2 +; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z2, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2) + ret <vscale x 6 x i64> %retval +} + define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) { ; SVE-LABEL: interleave4_nxv16i8: ; SVE: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll index be07978..8e0328e 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -38,17 +38,11 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_v2HalfH: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s0, s1, 
s0 +; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -88,19 +82,13 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_v3HalfH: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] ; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2] -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 +; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 ; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0 @@ -152,17 +140,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_HalfH: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI3_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI3_0] -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 @@ -250,16 +232,10 @@ define half @add_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI4_0 +; 
CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI4_0] -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 @@ -448,16 +424,10 @@ define half @add_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_2H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI7_0 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s3, h0 -; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI7_0] -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3 -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 -; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2 ; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll index c10d6e9..716401e 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -52,17 +52,11 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_HalfH: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 -; 
CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 @@ -144,16 +138,10 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0 +; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 @@ -321,16 +309,10 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_2H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI5_0 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s3, h0 -; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3 -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 -; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2 ; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll index c92e5c5..edb3607 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dereferenceable-declaration.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji -stop-after=irtranslator -o - %s | FileCheck %s declare align(8) dereferenceable(8) ptr @declared_with_ret_deref() #0 declare align(8) ptr @unknown_decl() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 1aee6ab..1b879a6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -403,40 +403,38 @@ define half @v_neg_rcp_f16(half %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: 
v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -460,40 +458,38 @@ define half @v_rcp_f16(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, 
-v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -517,40 +513,38 @@ define half @v_rcp_f16_arcp(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 
v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -575,9 +569,7 @@ define half @v_rcp_f16_arcp_afn(half %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: 
s_setpc_b64 s[30:31] ; @@ -600,40 +592,38 @@ define half @v_rcp_f16_ulp25(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 
4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -1454,70 +1444,67 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 
v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: 
v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, 
-v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1526,30 +1513,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; 
GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1561,26 +1545,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_rcp_v2f16: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; 
GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1594,30 +1575,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, 
v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1628,26 +1606,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_rcp_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1 +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: 
v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1660,30 +1636,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: 
v_add_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -1696,24 +1669,21 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: 
v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -1726,27 +1696,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; 
GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x @@ -1757,70 +1725,67 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; 
GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, 
-v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1829,30 +1794,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; 
GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1864,26 +1826,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1897,30 +1856,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-IEEE-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1931,26 +1887,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1 +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1963,30 +1917,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; 
GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -1999,24 +1950,21 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -2029,27 +1977,25 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; 
GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x @@ -2064,33 +2010,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 
x half> %x) { ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0 +; 
GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2101,39 +2046,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; 
GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2143,30 +2086,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-IEEE-NEXT: 
v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2179,26 +2119,23 
@@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2213,30 +2150,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; 
GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2248,26 +2182,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2 +; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -2279,32 +2211,29 @@ define <2 x 
half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; 
GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -2316,26 +2245,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -2346,30 +2272,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_cvt_f32_f16_e32 v5, 1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, 
v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2386,33 +2312,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 
v0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2423,39 +2348,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, 
-v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2465,30 +2388,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; 
GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2501,26 +2421,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2535,30 +2452,27 @@ define <2 x half> 
@v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2570,26 +2484,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2 +; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-FLUSH-NEXT: 
v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -2601,32 +2513,29 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; 
GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -2638,26 +2547,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ 
-2668,30 +2574,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_neg_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mov_b32_e32 v5, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3 
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2704,70 +2610,67 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 
v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; 
GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2814,11 +2717,8 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2864,70 +2764,67 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, 
v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; 
GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2936,30 +2833,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; 
GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2971,26 +2865,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: 
v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -3004,30 +2895,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 -; GFX9-IEEE-NEXT: 
v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -3038,26 +2926,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX9-FLUSH: ; %bb.0: ; 
GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1 +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -3070,30 +2956,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -3106,24 +2989,21 @@ define <2 x half> 
@v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -3136,27 +3016,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: 
v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x @@ -4033,40 +3911,38 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_rcp_f16: ; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog ; ; GFX6-FLUSH-LABEL: s_rcp_f16: ; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 
v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 @@ -4099,40 +3975,38 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) { define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_neg_rcp_f16: ; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; 
GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog ; ; GFX6-FLUSH-LABEL: s_neg_rcp_f16: ; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 @@ -4166,21 +4040,20 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_rsq_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 
v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog @@ -4188,24 +4061,23 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) { ; GFX6-FLUSH-LABEL: s_rsq_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; 
GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog @@ -4241,36 +4113,35 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v10, 
-v3, v9, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4283,42 +4154,40 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 ; 
GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 
v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4330,31 +4199,28 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 -; 
GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 @@ -4369,25 +4235,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 -; 
GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 @@ -4402,25 +4265,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; 
GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4434,25 +4294,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; 
GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4466,25 +4324,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2 +; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 +; 
GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4498,25 +4354,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: 
v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4530,29 +4383,27 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, 
v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -4568,21 +4419,20 @@ define half @v_rsq_f16(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4590,24 +4440,23 @@ define half @v_rsq_f16(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; 
GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4632,21 +4481,20 @@ define half @v_neg_rsq_f16(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 
v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4654,24 +4502,23 @@ define half @v_neg_rsq_f16(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; 
GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4706,21 +4553,20 @@ define { half, half } @v_rsq_f16_multi_use(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; 
GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4728,24 +4574,23 @@ define { half, half } @v_rsq_f16_multi_use(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4785,21 +4630,20 @@ define half @v_rsq_f16_missing_contract0(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4807,24 +4651,23 @@ define half @v_rsq_f16_missing_contract0(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 
v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4859,21 +4702,20 @@ define half @v_rsq_f16_missing_contract1(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; 
GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4881,24 +4723,23 @@ define half @v_rsq_f16_missing_contract1(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4933,21 +4774,20 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4955,24 +4795,23 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5007,21 +4846,20 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, 
s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5029,24 +4867,23 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: 
v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5081,21 +4918,20 @@ define half @v_neg_rsq_f16_fabs(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: 
v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5103,24 +4939,23 @@ define half @v_neg_rsq_f16_fabs(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; 
GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5156,21 +4991,20 @@ define half @v_rsq_f16_arcp(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5178,24 +5012,23 @@ define half @v_rsq_f16_arcp(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5220,21 +5053,20 @@ define half @v_neg_rsq_f16_arcp(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: 
v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5242,24 +5074,23 @@ define half @v_neg_rsq_f16_arcp(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 
v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5294,12 +5125,10 @@ define half @v_rsq_f16_afn(half %a) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -5324,12 +5153,10 @@ define half @v_rsq_f16_afn_nocontract(half %a) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -5365,36 +5192,35 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 
v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 
v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -5404,42 +5230,40 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, 
v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5448,31 +5272,28 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, 
v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -5486,25 +5307,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-FLUSH-NEXT: 
v_cvt_f32_f16_e32 v2, v1 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -5518,25 +5336,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 -; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5549,25 +5364,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 
op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2 +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5580,25 +5393,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, 
v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5611,25 +5422,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 +; GFX10-FLUSH-NEXT: 
v_mad_f32 v7, -v3, v5, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5642,7 +5450,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5650,22 +5458,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, 
v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -5679,36 +5485,35 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 -; GFX6-IEEE-NEXT: 
v_div_fmas_f32 v3, v3, v6, v9 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -5718,42 +5523,40 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 
-1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, 
-v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5762,31 +5565,28 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX8-IEEE-NEXT: 
v_mul_f32_e32 v3, v3, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 +; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 @@ -5800,25 +5600,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 -; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 -; 
GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 @@ -5832,25 +5629,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 -; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: 
v_rcp_f32_e32 v5, v3 +; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5863,25 +5657,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 
op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2 +; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5894,25 +5686,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2 +; GFX10-IEEE-NEXT: 
v_fma_f32 v6, v6, v3, -v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5925,25 +5715,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; 
GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5956,7 +5743,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5964,22 +5751,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 +; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] +; GFX11-NEXT: 
v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index e6a8bac..2356dad 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-promote-alloca < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX942 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1100 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -new-reg-bank-select -mattr=-unaligned-access-mode -mattr=-promote-alloca < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 302b239..549af87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -88,11 +88,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_or_b32_e32 v1, s4, v0 ; CI-NEXT: .LBB0_8: ; %Flow19 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s2, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 ; CI-NEXT: s_and_b32 s2, 1, s2 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc @@ -1197,16 +1196,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_or_b32_e32 v1, s4, v1 ; CI-NEXT: .LBB9_16: ; %Flow54 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, 0 ; CI-NEXT: s_and_b32 s0, s0, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00 ; CI-NEXT: s_cselect_b32 s4, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s2, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; CI-NEXT: s_and_b32 s3, 1, s4 @@ 
-1730,26 +1728,25 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_or_b32_e32 v3, s1, v3 ; CI-NEXT: .LBB10_32: ; %Flow124 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v5, 0 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00 ; CI-NEXT: s_cselect_b32 s11, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 ; CI-NEXT: s_and_b32 s2, s6, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s6, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s3 ; CI-NEXT: s_and_b32 s4, s5, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 ; CI-NEXT: s_cselect_b32 s12, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5 +; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s10 ; CI-NEXT: s_and_b32 s7, s7, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00 ; CI-NEXT: s_cselect_b32 s7, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5 +; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_mov_b32_e32 v4, 0x7e00 ; CI-NEXT: s_and_b32 s10, 1, s11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 920d8fa..ae7f6ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @v_insert_v64i32_37(ptr addrspace(1) %ptr.in, ptr addrspace(1) %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll index cfbb429..aabf256 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll @@ -1,11 +1,11 @@ -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1011 < %s | 
FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-WGP %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,ALIGNED-CU %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED %s ; GCN-LABEL: test_local_misaligned_v2: ; GCN-DAG: ds_{{read2|load_2addr}}_b32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll index 66cdfc2..7b923f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll @@ -1,14 +1,14 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v4.ll ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -o %t.v6.ll -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel 
-mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s -; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa < %t.v4.ll | FileCheck --check-prefixes=ALL,HSA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global < %t.v4.ll | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga < %t.v4.ll | FileCheck -check-prefixes=ALL,MESA3D,UNPACKED %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select 
-mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %t.v4.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx11-generic -amdgpu-enable-vopd=0 < %t.v6.ll | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir new file mode 100644 index 0000000..87836e2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-sext.mir @@ -0,0 +1,170 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: assert_sext_vgpr +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: assert_sext_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32) + %copy:_(s32) = COPY $vgpr0 + %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... + +--- +name: assert_sext_sgpr +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8 + + ; CHECK-LABEL: name: assert_sext_sgpr + ; CHECK: liveins: $sgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sgpr(s32) = COPY $sgpr8 + ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT %copy, 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32) + %copy:_(s32) = COPY $sgpr8 + %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... 
+ +--- +name: assert_sext_agpr +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; CHECK-LABEL: name: assert_sext_agpr + ; CHECK: liveins: $agpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0 + ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT %copy, 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32) + %copy:_(s32) = COPY $agpr0 + %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... + +--- +name: assert_sext_vgpr_regclass +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: assert_sext_vgpr_regclass + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_sext:vgpr(s32) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32) + %copy:vgpr_32(s32) = COPY $vgpr0 + %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... + +--- +name: assert_sext_sgpr_regcllass +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr8 + + ; CHECK-LABEL: name: assert_sext_sgpr_regcllass + ; CHECK: liveins: $sgpr8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_sext:sgpr(s32) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s32) + %copy:sgpr_32(s32) = COPY $sgpr8 + %assert_sext:_(s32) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... 
+ +--- +name: assert_sext_vgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: assert_sext_vgpr_64 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_sext:vreg_64(s64) = COPY [[ASSERT_SEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64) + %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + %assert_sext:vreg_64(s64) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... + +--- +name: assert_sext_sgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: assert_sext_sgpr_64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_sext:sreg_64(s64) = COPY [[ASSERT_SEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64) + %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + %assert_sext:sreg_64(s64) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... 
+ +--- +name: assert_sext_agpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1 + + ; CHECK-LABEL: name: assert_sext_agpr_64 + ; CHECK: liveins: $agpr0_agpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_SEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_sext:areg_64(s64) = COPY [[ASSERT_SEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_sext(s64) + %copy:areg_64(s64) = COPY $agpr0_agpr1 + %assert_sext:areg_64(s64) = G_ASSERT_SEXT %copy, 4 + S_ENDPGM 0, implicit %assert_sext +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir index 0bce908..c64a8ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-assert-zext.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s --- name: assert_zext_vgpr @@ -53,8 +53,8 @@ body: | ; CHECK-LABEL: name: assert_zext_agpr ; CHECK: liveins: $agpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %copy:agpr(s32) = COPY $agpr0 - ; CHECK-NEXT: %assert_zext:agpr(s32) = G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: %copy:vgpr(s32) = COPY $agpr0 + ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT %copy, 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:_(s32) = COPY $agpr0 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 @@ -74,7 +74,8 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %copy:vgpr_32(s32) = COPY $vgpr0 - ; CHECK-NEXT: %assert_zext:vgpr(s32) = 
G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_zext:vgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:vgpr_32(s32) = COPY $vgpr0 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 @@ -94,9 +95,76 @@ body: | ; CHECK: liveins: $sgpr8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %copy:sgpr_32(s32) = COPY $sgpr8 - ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT %copy, 4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY %copy(s32) + ; CHECK-NEXT: %assert_zext:sgpr(s32) = G_ASSERT_ZEXT [[COPY]], 4 ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s32) %copy:sgpr_32(s32) = COPY $sgpr8 %assert_zext:_(s32) = G_ASSERT_ZEXT %copy, 4 S_ENDPGM 0, implicit %assert_zext ... + +--- +name: assert_zext_vgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: assert_zext_vgpr_64 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:vreg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:vreg_64(s64) = COPY $vgpr0_vgpr1 + %assert_zext:vreg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... 
+ +--- +name: assert_zext_sgpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: assert_zext_sgpr_64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:sgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:sreg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:sreg_64(s64) = COPY $sgpr0_sgpr1 + %assert_zext:sreg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... + +--- +name: assert_zext_agpr_64 +alignment: 4 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0_agpr1 + + ; CHECK-LABEL: name: assert_zext_agpr_64 + ; CHECK: liveins: $agpr0_agpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %copy:areg_64(s64) = COPY $agpr0_agpr1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY %copy(s64) + ; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:vgpr(s64) = G_ASSERT_ZEXT [[COPY]], 4 + ; CHECK-NEXT: %assert_zext:areg_64(s64) = COPY [[ASSERT_ZEXT]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit %assert_zext(s64) + %copy:areg_64(s64) = COPY $agpr0_agpr1 + %assert_zext:areg_64(s64) = G_ASSERT_ZEXT %copy, 4 + S_ENDPGM 0, implicit %assert_zext +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir index eee553e..a7023d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s --- name: smax_s32_ss @@ -188,8 +187,7 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]] ; CHECK-NEXT: [[SMAX1:%[0-9]+]]:sgpr(s32) = G_SMAX [[ASHR]], [[ASHR1]] ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMAX]](s32), [[SMAX1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir index ef60aa8..9dd5f45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect 
-verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s --- name: smin_s32_ss @@ -191,8 +190,7 @@ body: | ; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16 - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32) ; CHECK-NEXT: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]] ; CHECK-NEXT: [[SMIN1:%[0-9]+]]:sgpr(s32) = G_SMIN [[ASHR]], [[ASHR1]] ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMIN]](s32), [[SMIN1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir index 36a38aa..59d7dce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s --- name: umax_s32_ss @@ -186,15 +185,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; CHECK-NEXT: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[AND]], [[AND1]] ; CHECK-NEXT: [[UMAX1:%[0-9]+]]:sgpr(s32) = G_UMAX [[LSHR]], [[LSHR1]] ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMAX]](s32), [[UMAX1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir index bb232b5e..fdb05f6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 
-run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s --- name: umin_s32_ss @@ -190,15 +189,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; CHECK-NEXT: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[AND]], [[AND1]] ; CHECK-NEXT: [[UMIN1:%[0-9]+]]:sgpr(s32) = G_UMIN [[LSHR]], [[LSHR1]] ; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMIN]](s32), [[UMIN1]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 9ffc565..4f2c454 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2537,202 +2537,195 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v7, v4 -; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 +; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 ; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, 
v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 -; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; 
GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8] -; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 +; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 +; GISEL-NEXT: v_trunc_f32_e32 v8, v6 +; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 ; 
GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 ; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_mov_b32_e32 v2, v7 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 +; 
GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: 
v_add_i32_e64 v6, s[4:5], v13, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; 
GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc @@ -2743,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2755,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll index ac1e11b..dfa613c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll @@ -1,7 +1,7 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define i32 @test_min_max_ValK0_K1_i32(i32 %a) { ; GFX89-LABEL: test_min_max_ValK0_K1_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 82279e6..40b5db0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3035,203 +3035,193 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3 -; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mul_f32_e32 v4, 
0x2f800000, v3 +; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 +; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 
v9, v7 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_mov_b32_e32 v3, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4] -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; 
GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1] -; GISEL-NEXT: 
v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7] -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v9, v4 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6] -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5 -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] 
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v11, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 +; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 -; GISEL-NEXT: 
v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, 
v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc +; GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 4de1078..ded985e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2053,90 +2053,82 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2 ; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2 ; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5 +; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2 +; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5 ; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5 ; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7 -; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2 +; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2 ; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 
-1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11 -; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11 -; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18 -; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11] -; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2 -; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14 +; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11] +; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8 +; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7] +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9] +; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5] ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; 
GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll index 2b54123..f5068f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s 
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s define i32 @test_min_max_ValK0_K1_u32(i32 %a) { ; GFX89-LABEL: test_min_max_ValK0_K1_u32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index a41ec8e..be5543b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2058,42 +2058,34 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 +; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4 -; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v13, 0, 
v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7 +; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2 -; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8 ; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc ; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10 ; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll index c94b333..1f36902 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll @@ -726,16 +726,16 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE815:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ] -; CHECK-NEXT: 
[[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], 
[[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE016:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE117:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE016]], double [[TMP11]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE218:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE117]], double [[TMP12]], i64 2 @@ -746,8 +746,8 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4 -; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1 ; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE420]], ptr [[OUT]], align 1 +; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1 ; CHECK-NEXT: ret void ; entry: @@ -1187,11 +1187,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE960:%.*]] = extractelement <5 x double> [[IN]], i64 4 ; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[COND5_END]] ; CHECK: cond5.end: -; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], 
[[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE046:%.*]] = insertelement <5 x double> poison, double [[TMP25]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE147:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE046]], double [[TMP26]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE248:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE147]], double [[TMP27]], i64 2 @@ -1204,11 +1204,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr ; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE859:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE450]], i64 4 ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ 
[[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ] -; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ] ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE061:%.*]] = insertelement <5 x double> poison, double [[TMP30]], i64 0 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE162:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE061]], double [[TMP31]], i64 1 ; CHECK-NEXT: [[LARGEPHI_INSERTSLICE263:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE162]], double [[TMP32]], i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index a4f9ce3..7ff86ac 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -2160,7 +2160,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 ; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-GOODFREXP-NEXT: store 
volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]]) +; IEEE-GOODFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]]) +; IEEE-GOODFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]]) +; IEEE-GOODFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]]) +; IEEE-GOODFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]]) +; IEEE-GOODFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0 +; IEEE-GOODFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]]) +; IEEE-GOODFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0 +; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2231,7 +2246,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; 
IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 ; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]]) +; IEEE-BADFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]]) +; IEEE-BADFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]]) +; IEEE-BADFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]]) +; IEEE-BADFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]]) +; IEEE-BADFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0 +; IEEE-BADFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]]) +; IEEE-BADFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0 +; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> 
[[SQRT_X_3ULP]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2258,7 +2288,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) ; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> splat (float 1.000000e+00), [[SQRT_X_NO_MD]] ; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2:![0-9]+]] +; DAZ-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP41:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP39]]) +; DAZ-NEXT: [[TMP42:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP40]]) +; DAZ-NEXT: [[TMP43:%.*]] = insertelement <2 x float> poison, float [[TMP41]], i64 0 +; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = insertelement <2 x float> [[TMP43]], float [[TMP42]], i64 1 ; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 ; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -2276,7 +2311,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1 ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP44]]) ; 
DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) ; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 ; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 @@ -2290,7 +2327,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo ; DAZ-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0 ; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP25]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]] +; DAZ-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP36:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP34]]) +; DAZ-NEXT: [[TMP37:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP35]]) +; DAZ-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP36]], i64 0 +; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 ; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 ; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 ; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[X]], i64 0 @@ -3200,9 +3242,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) { ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP17:%.*]] = fneg contract float [[TMP13]] -; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: 
[[TMP42:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]]) +; DAZ-NEXT: [[TMP45:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP45]] ; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]]) ; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 ; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 @@ -3675,9 +3721,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar ; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 ; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 ; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]]) -; DAZ-NEXT: [[TMP17:%.*]] = fneg arcp contract float [[TMP13]] -; DAZ-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) +; DAZ-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; DAZ-NEXT: [[TMP18:%.*]] = fneg arcp contract float [[TMP29]] ; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) ; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]] ; 
DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]]) @@ -3850,19 +3900,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP50]] -; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]]) -; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]] ; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 
} [[TMP29]], 0 ; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1 @@ -3903,19 +3943,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] -; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) -; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]]) -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]] -; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]]) -; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]] ; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = 
extractvalue { float, i32 } [[TMP29]], 0 ; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]]) @@ -3956,9 +3986,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float ; DAZ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 ; DAZ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; DAZ-NEXT: [[TMP13:%.*]] = fneg contract float [[TMP9]] -; DAZ-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; DAZ-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; DAZ-NEXT: [[TMP14:%.*]] = fneg contract float [[TMP13]] ; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]]) ; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 ; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 4b14dc6..7ee0015f 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -21204,18 +21204,14 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fabs_bf16: @@ -21440,10 +21436,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_fabs_bf16: @@ -21451,10 +21444,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fneg_fabs_bf16: @@ -21510,23 +21500,17 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX7-NEXT: s_bitset0_b32 s0, 31 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; 
GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_fabs_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll new file mode 100644 index 0000000..f466513 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Make sure the coalescer doesn't introduce any uses of +; vreg_1024. None are available to allocate with the register budget +; of this function. + +define void @no_introduce_vreg_1024() #0 { +; CHECK-LABEL: no_introduce_vreg_1024: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_mov_b32_e32 v9, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use v[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_setpc_b64 s[30:31] + %tuple = call <8 x i32> asm sideeffect "; def $0","=v"() + %sub0 = extractelement <8 x i32> %tuple, i32 0 + %insert = insertelement <16 x i32> poison, i32 %sub0, i32 9 + call void asm sideeffect "; use $0","v"(<16 x i32> %insert) + ret void +} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" } diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir new file mode 100644 index 0000000..1f414eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir @@ -0,0 +1,34 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-run-pass=register-coalescer -o - %s | FileCheck %s + +# The register budget for this function does not permit using 1024-bit +# registers. The coalescer should not introduce a 1024-bit virtual +# register which will fail to allocate. + +--- | + define void @no_introduce_vreg_1024() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="10,10" } +... +--- +name: no_introduce_vreg_1024 +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 10 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: no_introduce_vreg_1024 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub9:vreg_512 = COPY [[COPY]].sub0 + ; CHECK-NEXT: SI_RETURN implicit [[COPY1]] + %0:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + undef %1.sub9:vreg_512 = COPY %0.sub0 + SI_RETURN implicit %1 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index 5d184b1..c46fcde 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -218,19 +218,11 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s4, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s5, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3| -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5| -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -537,16 +529,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3| -; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 
v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, v2, v5 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -898,16 +889,13 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0| -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll index 3983655..38239c5 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1634,29 +1634,18 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: ; IR-IEEE-SDAG: ; %bb.0: ; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000 -; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000 ; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0 -; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; IR-IEEE-SDAG-NEXT: 
v_fma_f32 v3, -v2, v1, v0 -; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0 -; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2 ; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 ; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 ; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 ; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 @@ -1668,24 +1657,14 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: ; IR-IEEE-GISEL: ; %bb.0: ; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 -; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 +; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 ; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0 -; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1 -; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, 
v1, v0 -; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1 -; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0 -; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] -; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] -; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1 +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc +; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 ; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 ; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 @@ -1705,75 +1684,24 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; -; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-DAZ-SDAG: ; %bb.0: -; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 
v2, 0x37800000, v1 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1 -; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract: -; IR-DAZ-GISEL: ; %bb.0: -; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0 -; IR-DAZ-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260 -; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, 
s[4:5], v0, v0, 1.0 -; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 -; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 -; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 -; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 -; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 -; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0 %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0 ret float %fdiv diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 9233f80..9e15225 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7464,18 +7464,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: 
s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0 -; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -7639,27 +7636,24 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 ; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0 ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-GISEL-NEXT: v_add_f32_e32 
v2, v4, v2 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -8712,12 +8706,10 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0 -; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0 -; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8796,17 +8788,15 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0 -; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 +; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; 
SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 64a9727..76da0aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -107,12 +107,10 @@ define amdgpu_kernel void @fneg_fabs_fmul_bf16(ptr addrspace(1) %out, bfloat %x, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_mul_f32_e64 v0, s2, -|v0| ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -204,12 +202,10 @@ define amdgpu_kernel void @fneg_fabs_free_bf16(ptr addrspace(1) %out, i16 %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -279,12 +275,10 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, 
s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -345,43 +339,22 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { } define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CI-LABEL: v_fneg_fabs_bf16: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_ushort v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: flat_store_short v[0:1], v2 -; CI-NEXT: s_endpgm -; -; VI-LABEL: v_fneg_fabs_bf16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_endpgm +; CIVI-LABEL: v_fneg_fabs_bf16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: 
s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: v_fneg_fabs_bf16: ; GFX9: ; %bb.0: @@ -431,21 +404,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_add_f32_e64 v0, s3, 2.0 -; CI-NEXT: v_add_f32_e64 v1, s2, 1.0 -; CI-NEXT: v_readfirstlane_b32 s2, v0 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: s_bitset0_b32 s2, 31 -; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: s_lshr_b32 s2, s2, 16 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16 +; CI-NEXT: v_add_f32_e64 v1, s2, 2.0 +; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -566,15 +531,10 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: s_lshl_b32 s2, s3, 16 -; CI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -629,27 +589,11 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s4, s2, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: s_and_b32 s2, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s5, s3, 16 -; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5| -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; CI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -860,21 +804,20 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff -; CI-NEXT: s_and_b32 s2, s4, 
0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2 -; CI-NEXT: s_lshl_b32 s1, s1, 16 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1 -; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b32 s2, s0, 0x8000 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s1, s1, s2 +; CI-NEXT: s_bitset1_b32 s1, 31 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dword v[0:1], v5 -; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_multi_use_fabs_v2bf16: @@ -1086,5 +1029,3 @@ declare <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat>) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CIVI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index d232693..98044a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -14,11 +14,10 @@ define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -93,9 +92,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_ushort v2, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -170,11 +167,10 @@ define amdgpu_kernel void @s_fneg_free_bf16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -248,9 +244,9 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 ; 
CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, v3, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: flat_store_short v[0:1], v2 @@ -365,13 +361,13 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -426,16 +422,16 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -501,13 +497,11 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: 
flat_load_dword v2, v[0:1] +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_bfi_b32 v2, s0, v3, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -570,13 +564,13 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -637,16 +631,14 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3 -; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; CI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e64 v2, -v2, v2 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -912,12 +904,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) # ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; CI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll new file mode 100644 index 0000000..49204f8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i16_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i16 + %fp = uitofp i16 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i32_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i32_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i32 + %fp = uitofp i32 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) { +; GFX6-LABEL: fptoui_f32_to_i64_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i64_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i64 + %fp = uitofp i64 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i16_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i16_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i16 + %fp = uitofp i16 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i32_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i32 + %fp = uitofp i32 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) { +; GFX6-LABEL: fptoui_f16_to_i64_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i64_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i64 + %fp = uitofp i64 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i16_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i16 + %fp = uitofp i16 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i32_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i32_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i32 + %fp = uitofp i32 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) { +; GFX6-LABEL: fptoui_f64_to_i64_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s5, 0xfffff +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_not_b32_e32 v0, 31 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; GFX6-NEXT: s_addk_i32 s7, 0xfc01 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000 +; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] +; GFX6-NEXT: s_cmp_lt_i32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, 0, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_cmp_gt_i32 s7, 51 +; GFX6-NEXT: s_cselect_b32 
s3, s3, s5 +; GFX6-NEXT: s_cselect_b32 s2, s2, s4 +; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0 +; GFX6-NEXT: v_mov_b32_e32 v4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3 +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i64_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i64 + %fp = uitofp i64 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 67d0410..3324018 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -3,11 +3,11 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 
| FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; SI-LABEL: is_private_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 63333ed..355d002 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -3,11 +3,11 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s ; RUN: llc -global-isel=0 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CIT-LABEL: is_local_vgpr: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index ee11b92..0c1448a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % 
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b32_e32 v12, s16 +; GISEL-NEXT: v_mov_b32_e32 v16, s16 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -834,24 +834,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 
v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1349,24 +1349,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] 
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1513,24 +1513,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1677,24 +1677,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1841,24 +1841,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v12, s2 +; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: 
s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index af79c91..ac356fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -6011,8 +6011,7 @@ define half @v_exp_f16_fast(half %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 @@ -6512,10 +6511,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -6709,12 +6707,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index a99c199..d12ebe4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -6092,8 +6092,7 @@ define half @v_exp10_f16_fast(half %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 @@ -6594,10 +6593,9 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -6791,12 +6789,11 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: 
v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 3f66c23..259ee0b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -488,13 +488,11 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -582,15 +580,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) -; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; 
GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 21e6faf4..ba77552 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -313,13 +313,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -1009,28 +1007,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 
v0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> @@ -1225,25 +1221,23 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3 -; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; 
GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] @@ -1441,30 +1435,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5 -; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5 -; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GISEL-CI-NEXT: 
v_min_f32_e32 v3, 1.0, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -1622,16 +1614,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1790,17 +1780,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git 
a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 4f73e8e..c90b2c9 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -271,8 +271,7 @@ define half @v_maximumnum_f16_1.0(half %x) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 0af655df..4bb6538 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -2399,8 +2399,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2409,7 +2410,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop 
Header: Depth=2 ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_add_i32 s1, s1, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s1, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2 @@ -2468,8 +2469,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2478,7 +2480,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_add_i32 s1, s1, -1 ; GFX942-NEXT: s_cmp_lg_u32 s1, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB9_2 diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 558006d..64e8b7b 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -271,8 +271,7 @@ define half @v_minimumnum_f16_1.0(half 
%x) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir index 8d5b5e4..b41aa08 100644 --- a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir +++ b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir @@ -510,14 +510,14 @@ body: | ; RPU-NEXT: 0 0 $sgpr0 = S_BUFFER_LOAD_DWORD_IMM $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0 ; RPU-NEXT: 0 0 ; RPU-NEXT: 0 1 undef %0.sub5:vreg_512 = V_MOV_B32_e32 5, implicit $exec - ; RPU-NEXT: 0 0 - ; RPU-NEXT: 0 0 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc - ; RPU-NEXT: 0 0 - ; RPU-NEXT: 0 0 S_CBRANCH_SCC1 %bb.2, implicit $scc - ; RPU-NEXT: 0 0 - ; RPU-NEXT: 0 0 S_BRANCH %bb.1 - ; RPU-NEXT: 0 0 - ; RPU-NEXT: Live-out: + ; RPU-NEXT: 0 1 + ; RPU-NEXT: 0 1 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc + ; RPU-NEXT: 0 1 + ; RPU-NEXT: 0 1 S_CBRANCH_SCC1 %bb.2, implicit $scc + ; RPU-NEXT: 0 1 + ; RPU-NEXT: 0 1 S_BRANCH %bb.1 + ; RPU-NEXT: 0 1 + ; RPU-NEXT: Live-out: %0:0000000000000C00 ; RPU-NEXT: Live-thr: ; RPU-NEXT: 0 0 ; RPU-NEXT: bb.1: @@ -571,8 +571,6 @@ body: | ; RPD-NEXT: 0 1 S_BRANCH %bb.1 ; RPD-NEXT: 0 1 ; RPD-NEXT: Live-out: %0:0000000000000C00 - ; RPD-NEXT: mis LIS: - ; RPD-NEXT: %0:L0000000000000C00 isn't found in LIS reported set ; RPD-NEXT: Live-thr: ; RPD-NEXT: 0 0 ; RPD-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll new file mode 100644 index 0000000..f53aaaa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -0,0 +1,625 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | 
FileCheck %s + +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) +declare i32 @llvm.amdgcn.s.quadmask.i32(i32) +declare i64 @llvm.amdgcn.s.quadmask.i64(i64) + +define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: shl32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: shl64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: lshr32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshr_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = lshr i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: lshr64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result 
= lshr i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: ashr32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_ashr_i32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = ashr i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: ashr64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = ashr i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @abs32(i32 inreg %val0) { +; CHECK-LABEL: abs32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_abs_i32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %neg = sub i32 0, %val0 + %cond = icmp sgt i32 %val0, %neg + %result = select i1 %cond, i32 %val0, i32 %neg + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: and32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; 
CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: and64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: or64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: xor32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part 
epilog + %result = xor i32 %val0, %val1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: xor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, %val1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: nand32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nand_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nand64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = and i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: nor32: +; CHECK: ; 
%bb.0: +; CHECK-NEXT: s_nor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = or i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: xnor32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xnor_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i32 %val0, %val1 + %result2 = xor i32 %result, -1 + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: xnor64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 
s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, %val1 + %result2 = xor i64 %result, -1 + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: andn232: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_andn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i32 %val1, -1 + %result = and i32 %val0, %nval1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: nandn264: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i64 %val1, -1 + %result = and i64 %val0, %nval1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: orn232: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_orn2_b32 s0, s0, s1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i32 %val1, -1 + %result = or i32 %val0, %nval1 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 
%zext +} + +define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: orn264: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %nval1 = xor i64 %val1, -1 + %result = or i64 %val0, %nval1 + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) { +; CHECK-LABEL: bfe_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i32 %val0, 8 + %result = ashr i32 %shl, 24 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_i64(i64 inreg %val0) { +; CHECK-LABEL: bfe_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x80000 +; CHECK-NEXT: s_and_b32 s0, s0, 0xff +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i64 %val0, 56 + %result = ashr i64 %shl, 56 + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) { +; CHECK-LABEL: bfe_u32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: 
v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i32 %val0, 8 + %result = lshr i32 %shl, 24 + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { +; CHECK-LABEL: bfe_u64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_and_b32 s0, s0, 0xff +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %shl = shl i64 %val0, 56 + %result = lshr i64 %shl, 56 + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { +; CHECK-LABEL: bcnt032: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_sub_i32 s0, 32, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %result2 = sub i32 32, %result + call void asm "; use $0", "s"(i32 %result2) + %cmp = icmp ne i32 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { +; CHECK-LABEL: bcnt064: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_sub_u32 s0, 64, s0 +; CHECK-NEXT: s_subb_u32 s1, 0, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 
s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + %result2 = sub i64 64, %result + call void asm "; use $0", "s"(i64 %result2) + %cmp = icmp ne i64 %result2, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt132(i32 inreg %val0) { +; CHECK-LABEL: bcnt132: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @bcnt164(i64 inreg %val0) { +; CHECK-LABEL: bcnt164: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @quadmask32(i32 inreg %val0) { +; CHECK-LABEL: quadmask32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_quadmask_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = 
call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val0) nounwind readnone + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @quadmask64(i64 inreg %val0) { +; CHECK-LABEL: quadmask64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val0) nounwind readnone + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @not32(i32 inreg %val0) { +; CHECK-LABEL: not32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_not_b32 s0, s0 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i32 %val0, -1 + call void asm "; use $0", "s"(i32 %result) + %cmp = icmp ne i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +define amdgpu_ps i32 @not64(i64 inreg %val0) { +; CHECK-LABEL: not64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_not_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = xor i64 %val0, -1 + call void asm "; use $0", "s"(i64 %result) + %cmp = icmp ne i64 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0..b52913f 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -20,34 +20,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 +; GFX90A-NEXT: s_cselect_b32 s7, s3, s2 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 +; GFX90A-NEXT: s_cselect_b32 s7, s4, s7 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 +; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 ; GFX90A-NEXT: s_or_b32 s7, s7, s0 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 ; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 +; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 ; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 +; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 ; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 +; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 ; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 -; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 +; GFX90A-NEXT: s_cselect_b32 s2, s7, s2 ; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] ; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] ; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 +; GFX90A-NEXT: s_cselect_b32 s6, 0, s6 ; GFX90A-NEXT: s_mov_b64 vcc, vcc ; GFX90A-NEXT: s_cbranch_vccnz 
.LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -68,34 +68,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s4, s3 +; GFX942-NEXT: s_cselect_b32 s7, s3, s2 ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s5, s7 +; GFX942-NEXT: s_cselect_b32 s7, s4, s7 ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s6, s7 +; GFX942-NEXT: s_cselect_b32 s7, s5, s7 ; GFX942-NEXT: s_or_b32 s7, s7, s0 ; GFX942-NEXT: s_cmp_eq_u32 s1, 1 ; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s4, s7, s4 +; GFX942-NEXT: s_cselect_b32 s3, s7, s3 ; GFX942-NEXT: s_cmp_eq_u32 s1, 3 ; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 ; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX942-NEXT: s_cselect_b32 s6, s7, s6 +; GFX942-NEXT: s_cselect_b32 s5, s7, s5 ; GFX942-NEXT: s_cmp_eq_u32 s1, 2 ; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX942-NEXT: s_cselect_b32 s5, s7, s5 +; GFX942-NEXT: s_cselect_b32 s4, s7, s4 ; GFX942-NEXT: s_cmp_eq_u32 s1, 0 -; GFX942-NEXT: s_cselect_b32 s3, s7, s3 +; GFX942-NEXT: s_cselect_b32 s2, s7, s2 ; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] ; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] ; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s2, 0, s2 +; GFX942-NEXT: s_cselect_b32 s6, 0, s6 ; GFX942-NEXT: s_mov_b64 vcc, vcc ; GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -117,34 +117,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 ; 
GFX1030-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1030-NEXT: s_cselect_b32 s7, s3, s2 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1030-NEXT: s_cselect_b32 s7, s4, s7 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 ; GFX1030-NEXT: s_or_b32 s7, s7, s0 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 ; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1030-NEXT: s_cselect_b32 s2, s7, s2 ; GFX1030-NEXT: s_or_b32 s7, s10, s8 ; GFX1030-NEXT: s_or_b32 s7, s9, s7 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1030-NEXT: s_cselect_b32 s6, 0, s6 ; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock ; GFX1030-NEXT: s_endpgm @@ -166,38 +166,38 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1100-NEXT: s_cselect_b32 s7, s3, s2 ; GFX1100-NEXT: 
s_cmp_eq_u32 s1, 2 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1100-NEXT: s_cselect_b32 s7, s4, s7 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 +; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 ; GFX1100-NEXT: s_or_b32 s7, s7, s0 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 ; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 ; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 ; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 +; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 +; GFX1100-NEXT: s_cselect_b32 s2, s7, s2 ; GFX1100-NEXT: s_or_b32 s7, s10, s8 ; GFX1100-NEXT: s_or_b32 s7, s9, s7 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1100-NEXT: s_cselect_b32 s6, 0, s6 ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock ; GFX1100-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 8b467eb..75ae76f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1,7 +1,7 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 # RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s # RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s -# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90A %s --- name: test_pk_mul_unpacking_f32 @@ -57,26 +57,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM 
renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -150,26 +150,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept 
V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 
0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, 
implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -243,26 +243,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = 
S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -370,41 +370,41 @@ body: | ; GFX942-NEXT: 
renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_add_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec - ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec - ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr17 
= V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec - ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_add_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + 
; GFX90A-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX90A-NEXT: 
early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec @@ -438,7 +438,6 @@ body: | renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 - ... 
--- name: test_pk_fma_unpacking_f32 @@ -490,24 +489,24 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32 - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, 
killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_pk_fma_unpacking_f32 + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit 
$sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -577,25 +576,25 @@ body: | ; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 
0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable 
$sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: 
S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -670,28 +669,28 @@ body: | ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit 
$exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable 
$sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = 
V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -705,14 +704,11 @@ body: | early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec - S_ENDPGM 0 ... 
@@ -770,26 +766,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, 
killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, 
implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -861,26 +857,26 @@ body: | ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = 
S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec - ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: 
test_inst_dependent_on_mfma_are_not_unpacked + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; 
GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -950,25 +946,25 @@ body: | ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, 
implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: 
early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -1041,26 +1037,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; 
GFX90a-LABEL: name: test_unpacking_with_imm_input - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec 
- ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_unpacking_with_imm_input + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -1134,26 +1130,26 @@ body: | ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GFX942-NEXT: S_ENDPGM 0 ; - ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking - ; GFX90a: liveins: $sgpr4_sgpr5 - ; GFX90a-NEXT: {{ $}} - ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 
0 - ; GFX90a-NEXT: S_WAITCNT 49279 - ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX90a-NEXT: S_ENDPGM 0 + ; GFX90A-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX90A: liveins: $sgpr4_sgpr5 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90A-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90A-NEXT: S_WAITCNT 49279 + ; GFX90A-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90A-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90A-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 6b5bae0..c9b94e0 100644 --- 
a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -6,12 +6,12 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck 
-check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s ; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index b5d9d00..8d0e003 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; FIXME-TRUE16. enable gisel -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i8: @@ -1632,6 +1632,7 @@ entry: ret i8 %res } +; FIXME: With -new-reg-bank-select, 
v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX9-GISEL-LABEL: test_vector_reduce_smax_v2i16: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_smax_v2i16: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v2i16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1900,6 +1901,7 @@ entry: ret i16 %res } +; FIXME: With 
-new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -2049,6 +2051,7 @@ entry: ret i16 %res } +; FIXME: With 
-new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2247,6 +2250,7 
@@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2391,7 +2395,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2419,7 +2423,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2467,7 +2471,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2528,7 +2532,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1 ; 
GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 2a989ec..f15ecf0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 
-mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; FIXME-TRUE16. enable gisel -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i8: @@ -1632,6 +1632,7 @@ entry: ret i8 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. 
define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX9-GISEL-LABEL: test_vector_reduce_smin_v2i16: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_smin_v2i16: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v2i16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1900,6 +1901,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -2049,6 +2051,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2391,7 +2394,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x 
i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2419,7 +2422,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2467,7 +2470,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2528,7 +2531,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 69fd58a..e62165c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; FIXME-TRUE16. 
enable gisel -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i8: @@ -1525,6 +1525,7 @@ entry: ret i8 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. 
define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1569,7 +1570,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX9-GISEL-LABEL: test_vector_reduce_umax_v2i16: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1583,7 +1584,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_umax_v2i16: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1604,7 +1605,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v2i16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1638,7 +1639,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1782,6 +1783,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1841,7 +1843,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1857,7 +1859,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1883,7 +1885,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1921,7 +1923,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -1929,6 +1931,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2017,7 +2020,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2037,7 +2040,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2070,7 +2073,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2116,7 +2119,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2125,6 +2128,7 @@ entry: ret i16 %res } +; FIXME: With 
-new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2267,7 +2271,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2295,7 +2299,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2343,7 +2347,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2404,7 +2408,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] diff 
--git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 1d3b42e..83ecaaa 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck 
-check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; FIXME-TRUE16. enable gisel -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i8: @@ -1271,6 +1271,7 @@ entry: ret i8 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. 
define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1312,7 +1313,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { ; GFX9-GISEL-LABEL: test_vector_reduce_umin_v2i16: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1326,7 +1327,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { ; GFX10-GISEL-LABEL: test_vector_reduce_umin_v2i16: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1347,7 +1348,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v2i16: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1381,7 +1382,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1527,6 +1528,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v4i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1583,7 +1585,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1599,7 +1601,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1625,7 +1627,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1663,7 +1665,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) { ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -1671,6 +1673,7 @@ entry: ret i16 %res } +; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. 
Need pattern to look through COPY. define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -1756,7 +1759,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1776,7 +1779,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -1809,7 +1812,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1855,7 +1858,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1864,6 +1867,7 @@ entry: ret i16 %res } +; FIXME: With 
-new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY. define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i16: ; GFX7-SDAG: ; %bb.0: ; %entry @@ -2003,7 +2007,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX9-GISEL-NEXT: s_nop 0 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_nop 0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2031,7 +2035,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16 ; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -2079,7 +2083,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2140,7 +2144,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) { ; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] diff 
--git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll new file mode 100644 index 0000000..a2d6ca9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll @@ -0,0 +1,39 @@ +;; Test if a potential indirect call target function which has internal linkage and +;; address taken has its type ID emitted to callgraph section. +;; This test also makes sure that callback functions which meet the above constraint +;; are handled correctly. + +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @_Z6doWorkPFviE(ptr) + +define i32 @_Z4testv() !type !1 { +entry: + call void @_Z6doWorkPFviE(ptr nonnull @_ZL10myCallbacki) + ret i32 0 +} + +; CHECK: _ZL10myCallbacki: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define internal void @_ZL10myCallbacki(i32 %value) !type !2 { +entry: + %sink = alloca i32, align 4 + store volatile i32 %value, ptr %sink, align 4 + %i1 = load volatile i32, ptr %sink, align 4 + ret void +} + +!0 = !{i64 0, !"_ZTSFvPFviEE.generalized"} +!1 = !{i64 0, !"_ZTSFivE.generalized"} +!2 = !{i64 0, !"_ZTSFviE.generalized"} + +; CHECK: .section .callgraph,"o",%progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. +; CHECK-NEXT: .byte 1 +;; Function Entry PC +; CHECK-NEXT: .long [[LABEL_FUNC]] +;; Function type ID -5212364466660467813 +; CHECK-NEXT: .long 1154849691 +; CHECK-NEXT: .long 3081369122 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll new file mode 100644 index 0000000..bf5249e --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll @@ -0,0 +1,63 @@ +;; Test if temporary labels are generated for each indirect callsite. 
+;; Test if the .callgraph section contains the MD5 hash of callees' type (type id) +;; is correctly paired with its corresponding temporary label generated for indirect +;; call sites annotated with !callee_type metadata. +;; Test if the .callgraph section contains unique direct callees. + +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @direct_foo() +declare !type !1 i32 @direct_bar(i8) +declare !type !2 ptr @direct_baz(ptr) + +; CHECK: ball: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define ptr @ball() { +entry: + call void @direct_foo() + %fp_foo_val = load ptr, ptr null, align 8 + call void (...) %fp_foo_val(), !callee_type !0 + call void @direct_foo() + %fp_bar_val = load ptr, ptr null, align 8 + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 + %call_fp_bar_direct = call i32 @direct_bar(i8 1) + %fp_baz_val = load ptr, ptr null, align 8 + %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + call void @direct_foo() + %call_fp_baz_direct = call ptr @direct_baz(ptr null) + call void @direct_foo() + ret ptr %call_fp_baz +} + +!0 = !{!1} +!1 = !{i64 0, !"_ZTSFvE.generalized"} +!2 = !{!3} +!3 = !{i64 0, !"_ZTSFicE.generalized"} +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +; CHECK: .section .callgraph,"o",%progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags +; CHECK-NEXT: .byte 7 +;; Function Entry PC +; CHECK-NEXT: .long [[LABEL_FUNC]] +;; Function type ID -- set to 0 as no type metadata attached to function. +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 0 +;; Number of unique direct callees. +; CHECK-NEXT: .byte 3 +;; Direct callees. +; CHECK-NEXT: .long direct_foo +; CHECK-NEXT: .long direct_bar +; CHECK-NEXT: .long direct_baz +;; Number of unique indirect target type IDs. +; CHECK-NEXT: .byte 3 +;; Indirect type IDs. 
+; CHECK-NEXT: .long 838288420 +; CHECK-NEXT: .long 1053552373 +; CHECK-NEXT: .long 1505527380 +; CHECK-NEXT: .long 814631809 +; CHECK-NEXT: .long 342417018 +; CHECK-NEXT: .long 2013108216 diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll new file mode 100644 index 0000000..d577603 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll @@ -0,0 +1,34 @@ +;; Tests that we store the type identifiers in .callgraph section of the object file for tailcalls. + +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +define i32 @check_tailcall(ptr %func, i8 %x) !type !0 { +entry: + %call = tail call i32 %func(i8 signext %x), !callee_type !1 + ret i32 %call +} + +define i32 @main(i32 %argc) !type !3 { +entry: + %andop = and i32 %argc, 1 + %cmp = icmp eq i32 %andop, 0 + %foo.bar = select i1 %cmp, ptr @foo, ptr @bar + %call.i = tail call i32 %foo.bar(i8 signext 97), !callee_type !1 + ret i32 %call.i +} + +declare !type !2 i32 @foo(i8 signext) + +declare !type !2 i32 @bar(i8 signext) + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{i64 0, !"_ZTSFiiE.generalized"} + +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154 +; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8 +;; Verify that the type id 0x308e4b8159bc8654 is in section. +; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30 diff --git a/llvm/test/CodeGen/ARM/call-graph-section.ll b/llvm/test/CodeGen/ARM/call-graph-section.ll new file mode 100644 index 0000000..928a1067 --- /dev/null +++ b/llvm/test/CodeGen/ARM/call-graph-section.ll @@ -0,0 +1,37 @@ +;; Tests that we store the type identifiers in .callgraph section of the object file. 
+ +; RUN: llc -mtriple=arm-unknown-linux --call-graph-section -filetype=obj -o - < %s | \ +; RUN: llvm-readelf -x .callgraph - | FileCheck %s + +declare !type !0 void @foo() + +declare !type !1 i32 @bar(i8) + +declare !type !2 ptr @baz(ptr) + +define void @main() { +entry: + %fp_foo_val = load ptr, ptr null, align 8 + call void (...) %fp_foo_val(), !callee_type !1 + %fp_bar_val = load ptr, ptr null, align 8 + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !3 + %fp_baz_val = load ptr, ptr null, align 8 + %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + ret void +} + +;; Check that the numeric type id (md5 hash) for the below type ids are emitted +;; to the callgraph section. +!0 = !{i64 0, !"_ZTSFvE.generalized"} +!1 = !{!0} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{!2} +!4 = !{!5} +!5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +;; Make sure following type IDs are in call graph section +;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000324 +; CHECK-NEXT: 0x00000010 44f731f5 eecb3e54 86bc5981 4b8e307a +; CHECK-NEXT: 0x00000020 de6814f8 97fd77 diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index 800ee87..8230e47 100644 --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -1572,26 +1572,11 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 { } define void @test_fabs(ptr %p) { -; CHECK-FP16-LABEL: test_fabs: -; CHECK-FP16: ldrh r1, [r0] -; CHECK-FP16-NEXT: vmov s0, r1 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vabs.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r1, s0 -; CHECK-FP16-NEXT: strh r1, [r0] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fabs: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: mov r4, r0 -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] 
-; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: bic r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fabs: +; CHECK-ALL: ldrh r1, [r0] +; CHECK-ALL-NEXT: bfc r1, #15, #17 +; CHECK-ALL-NEXT: strh r1, [r0] +; CHECK-ALL-NEXT: bx lr %a = load half, ptr %p, align 2 %r = call half @llvm.fabs.f16(half %a) store half %r, ptr %p @@ -2454,26 +2439,11 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { } define void @test_fneg(ptr %p1, ptr %p2) #0 { -; CHECK-FP16-LABEL: test_fneg: -; CHECK-FP16: ldrh r0, [r0] -; CHECK-FP16-NEXT: vmov s0, r0 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vneg.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r0, s0 -; CHECK-FP16-NEXT: strh r0, [r1] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fneg: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] -; CHECK-LIBCALL-NEXT: mov r4, r1 -; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: eor r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fneg: +; CHECK-ALL: ldrh r0, [r0] +; CHECK-ALL-NEXT: eor r0, r0, #32768 +; CHECK-ALL-NEXT: strh r0, [r1] +; CHECK-ALL-NEXT: bx lr %v = load half, ptr %p1, align 2 %res = fneg half %v store half %res, ptr %p2, align 2 diff --git a/llvm/test/CodeGen/ARM/sincos.ll b/llvm/test/CodeGen/ARM/sincos.ll index e1b683a..1a4313e 100644 --- a/llvm/test/CodeGen/ARM/sincos.ll +++ b/llvm/test/CodeGen/ARM/sincos.ll @@ -2,8 +2,7 @@ ; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS ; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU -; RUN: llc 
< %s -mtriple=armv7-linux-android -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT-ANDROID -; RUN: llc < %s -mtriple=armv7-linux-android9 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU +; RUN: llc < %s -mtriple=armv7-linux-android -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU ; Combine sin / cos into a single call unless they may write errno (as ; captured by readnone attrbiute, controlled by clang -fmath-errno @@ -22,10 +21,6 @@ entry: ; NOOPT: bl _sinf ; NOOPT: bl _cosf -; NOOPT-ANDROID-LABEL: test1: -; NOOPT-ANDROID: bl sinf -; NOOPT-ANDROID: bl cosf - %call = tail call float @sinf(float %x) readnone %call1 = tail call float @cosf(float %x) readnone %add = fadd float %call, %call1 @@ -44,10 +39,6 @@ entry: ; NOOPT: bl _sinf ; NOOPT: bl _cosf -; NOOPT-ANDROID-LABEL: test1_fast: -; NOOPT-ANDROID: bl sinf -; NOOPT-ANDROID: bl cosf - %call = tail call fast float @sinf(float %x) readnone %call1 = tail call fast float @cosf(float %x) readnone %add = fadd float %call, %call1 @@ -68,10 +59,6 @@ entry: ; NOOPT: bl _sinf ; NOOPT: bl _cosf -; NOOPT-ANDROID-LABEL: test1_errno: -; NOOPT-ANDROID: bl sinf -; NOOPT-ANDROID: bl cosf - %call = tail call float @sinf(float %x) %call1 = tail call float @cosf(float %x) %add = fadd float %call, %call1 @@ -90,10 +77,6 @@ entry: ; NOOPT: bl _sin ; NOOPT: bl _cos -; NOOPT-ANDROID-LABEL: test2: -; NOOPT-ANDROID: bl sin -; NOOPT-ANDROID: bl cos - %call = tail call double @sin(double %x) readnone %call1 = tail call double @cos(double %x) readnone %add = fadd double %call, %call1 @@ -112,10 +95,6 @@ entry: ; NOOPT: bl _sin ; NOOPT: bl _cos -; NOOPT-ANDROID-LABEL: test2_fast: -; NOOPT-ANDROID: bl sin -; NOOPT-ANDROID: bl cos - %call = tail call fast double @sin(double %x) readnone %call1 = tail call fast double @cos(double %x) readnone %add = fadd double %call, %call1 @@ -136,10 +115,6 @@ entry: ; NOOPT: bl _sin ; NOOPT: bl _cos -; NOOPT-ANDROID-LABEL: test2_errno: -; NOOPT-ANDROID: bl sin -; NOOPT-ANDROID: bl cos - 
%call = tail call double @sin(double %x) %call1 = tail call double @cos(double %x) %add = fadd double %call, %call1 diff --git a/llvm/test/CodeGen/Generic/bfloat-op.ll b/llvm/test/CodeGen/Generic/bfloat-op.ll new file mode 100644 index 0000000..d593328 --- /dev/null +++ b/llvm/test/CodeGen/Generic/bfloat-op.ll @@ -0,0 +1,104 @@ +; Same as `bfloat.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `bfloat.ll` once they have the same platform coverage. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. + +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o 
- -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: hard float csky crashes +; FIXME: directx has a compiler error +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if 
x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-COPYSIGN,CHECK-FMA %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. + +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for filecheck, just make sure we don't crash. +; NOCRASH: {{.*}} + +; fneg, fabs and copysign all need to not quieten signalling NaNs, so should not call any conversion functions which do. +; These tests won't catch cases where the everything is done using native instructions instead of builtins. + +define void @test_fneg(ptr %p1, ptr %p2) #0 { +; ALL-LABEL: test_fneg{{"?}}: +; ALL-NEG-NOT: __extend +; ALL-NEG-NOT: __trunc +; ALL-NEG-NOT: __gnu +; ALL-NEG-NOT: __aeabi + %v = load bfloat, ptr %p1 + %res = fneg bfloat %v + store bfloat %res, ptr %p2 + ret void +} + +define void @test_fabs(ptr %p1, ptr %p2) { +; ALL-LABEL: test_fabs{{"?}}: +; ALL-ABS-NOT: __extend +; ALL-ABS-NOT: __trunc +; ALL-ABS-NOT: __gnu +; ALL-ABS-NOT: __aeabi + %a = load bfloat, ptr %p1 + %r = call bfloat @llvm.fabs.f16(bfloat %a) + store bfloat %r, ptr %p2 + ret void +} + +define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) { +; ALL-LABEL: test_copysign{{"?}}: +; CHECK-COPYSIGN-NOT: __extend +; CHECK-COPYSIGN-NOT: __trunc +; CHECK-COPYSIGN-NOT: __gnu +; CHECK-COPYSIGN-NOT: __aeabi +; BAD-COPYSIGN: __truncsfbf2 + %a = load bfloat, ptr %p1 + %b = load bfloat, ptr %p2 + %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) + store bfloat %r, ptr %p3 + ret void +} + +; There is no floating-point type LLVM supports that is large enough to promote bfloat FMA to +; without causing double rounding issues. 
This checks for libcalls to f32/f64 fma and truncating +; f32/f64 to bf16. See https://github.com/llvm/llvm-project/issues/131531 + +define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { +; ALL-LABEL: test_fma{{"?}}: +; CHECK-FMA-NOT: {{\bfmaf?\b}} +; CHECK-FMA-NOT: __truncsfbf2 +; CHECK-FMA-NOT: __truncdfbf2 +; BAD-FMA: {{__truncsfbf2|\bfmaf?\b}} + %a = load bfloat, ptr %p1 + %b = load bfloat, ptr %p2 + %c = load bfloat, ptr %p3 + %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) + store bfloat %r, ptr %p4 + ret void +} diff --git a/llvm/test/CodeGen/Generic/bfloat.ll b/llvm/test/CodeGen/Generic/bfloat.ll new file mode 100644 index 0000000..83c6711 --- /dev/null +++ b/llvm/test/CodeGen/Generic/bfloat.ll @@ -0,0 +1,75 @@ +; Simple cross-platform smoke checks for basic bf16 operations. +; +; There shouldn't be any architectures that crash when trying to use `bfloat`; +; check that here. Additionally do a small handful of smoke tests that work +; well cross-platform. + +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: arm64ec crashes when passing/returning bfloat +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if avr-registered-target %{ llc %s -o - 
-mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if bpf-registered-target %{ llc %s -o - -mtriple=bpfel | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: hard float csky crashes +; RUN: %if directx-registered-target %{ llc %s -o - -mtriple=dxil-pc-shadermodel6.3-library | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} +; RUN: %if 
x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. + +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for filecheck, just make sure we don't crash. +; NOCRASH: {{.*}} + +; All backends need to be able to bitcast without converting to another format, +; so we assert against libcalls (specifically __truncsfbf2). This won't catch hardware conversions. + +define bfloat @from_bits(i16 %bits) nounwind { +; ALL-LABEL: from_bits{{"?}}: +; ALL-NOT: __extend +; ALL-NOT: __trunc +; ALL-NOT: __gnu + %f = bitcast i16 %bits to bfloat + ret bfloat %f +} + +define i16 @to_bits(bfloat %f) nounwind { +; ALL-LABEL: to_bits{{"?}}: +; CHECK-NOT: __extend +; CHECK-NOT: __trunc +; CHECK-NOT: __gnu +; BAD: __truncsfbf2 + %bits = bitcast bfloat %f to i16 + ret i16 %bits +} + +define bfloat @check_freeze(bfloat %f) nounwind { +; ALL-LABEL: check_freeze{{"?}}: + %t0 = freeze bfloat %f + ret bfloat %t0 +} diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll new file mode 100644 index 0000000..1037d8e --- /dev/null +++ b/llvm/test/CodeGen/Generic/half-op.ll @@ -0,0 +1,115 @@ +; Same as `half.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `half.ll` once BPF doesn't have a compiler error. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. 
+ +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; FIXME: directx has a compiler error +; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} 
+; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu 
| FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xcore-registered-target %{ llc %s -o 
- -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. + +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for filecheck, just make sure we don't crash. +; NOCRASH: {{.*}} + +; fneg, fabs and copysign all need to not quieten signalling NaNs, so should not call any conversion functions which do. +; These tests won't catch cases where everything is done using native instructions instead of builtins. +; See https://github.com/llvm/llvm-project/issues/104915 + +define void @test_fneg(ptr %p1, ptr %p2) #0 { +; ALL-LABEL: test_fneg{{"?}}: +; CHECK-NEG-ABS-NOT: __extend +; CHECK-NEG-ABS-NOT: __trunc +; CHECK-NEG-ABS-NOT: __gnu +; CHECK-NEG-ABS-NOT: __aeabi +; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}} + %v = load half, ptr %p1 + %res = fneg half %v + store half %res, ptr %p2 + ret void +} + +define void @test_fabs(ptr %p1, ptr %p2) { +; ALL-LABEL: test_fabs{{"?}}: +; CHECK-NEG-ABS-NOT: __extend +; CHECK-NEG-ABS-NOT: __trunc +; CHECK-NEG-ABS-NOT: __gnu +; CHECK-NEG-ABS-NOT: __aeabi +; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}} + %a = load half, ptr %p1 + %r = call half @llvm.fabs.f16(half %a) + store half %r, ptr %p2 + ret void +} + +define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) { +; ALL-LABEL: test_copysign{{"?}}: +; CHECK-COPYSIGN-NOT: __extend +; CHECK-COPYSIGN-NOT: __trunc +; CHECK-COPYSIGN-NOT: __gnu +; CHECK-COPYSIGN-NOT: __aeabi +; BAD-COPYSIGN: {{__extendhfsf2|__gnu_h2f_ieee}} + %a = load half, ptr %p1 + %b = load half, ptr %p2 + %r = call half @llvm.copysign.f16(half %a, half %b) + store half %r, ptr %p3 + ret void +} + +; If promoting, fma must promote at least to f64 to avoid double rounding issues. 
+; This checks for calls to f32 fmaf and truncating f32 to f16. +; See https://github.com/llvm/llvm-project/issues/98389 + +define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { +; ALL-LABEL: test_fma{{"?}}: +; Allow fmaf16 +; CHECK-FMA-NOT: fmaf{{\b}} +; CHECK-FMA-NOT: __truncsfhf2 +; CHECK-FMA-NOT: __gnu_f2h_ieee +; CHECK-FMA-NOT: __aeabi_f2h +; BAD-FMA: {{__truncsfhf2|__gnu_f2h_ieee|__aeabi_f2h|fmaf\b}} + %a = load half, ptr %p1 + %b = load half, ptr %p2 + %c = load half, ptr %p3 + %r = call half @llvm.fma.f16(half %a, half %b, half %c) + store half %r, ptr %p4 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll index 39f9269..4998d87 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp.ll @@ -6,12 +6,10 @@ define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: ldbrx 3, 0, 3 ; CHECK-NEXT: ldbrx 4, 0, 4 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8) @@ -23,11 +21,11 @@ define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture reado ; CHECK: # %bb.0: ; CHECK-NEXT: lwbrx 3, 0, 3 ; CHECK-NEXT: lwbrx 4, 0, 4 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 +; CHECK-NEXT: extsw 3, 3 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4) ret i32 %call diff --git a/llvm/test/CodeGen/PowerPC/ucmp.ll b/llvm/test/CodeGen/PowerPC/ucmp.ll 
index d2dff6e..4d393dd 100644 --- a/llvm/test/CodeGen/PowerPC/ucmp.ll +++ b/llvm/test/CodeGen/PowerPC/ucmp.ll @@ -4,12 +4,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_8: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i8 %x, i8 %y) ret i8 %1 @@ -18,12 +16,10 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind { define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { ; CHECK-LABEL: ucmp_8_16: ; CHECK: # %bb.0: -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: sub 5, 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i16 %x, i16 %y) ret i8 %1 @@ -32,14 +28,10 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind { define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_8_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i32 %x, i32 %y) ret i8 %1 @@ -48,12 +40,10 @@ define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind { define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_8_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; 
CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i8 @llvm.ucmp(i64 %x, i64 %y) ret i8 %1 @@ -82,14 +72,10 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind { define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: ucmp_32_32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrldi 5, 4, 32 -; CHECK-NEXT: clrldi 6, 3, 32 -; CHECK-NEXT: sub 5, 5, 6 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: rldic 3, 3, 0, 32 -; CHECK-NEXT: rldicl 5, 5, 1, 63 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i32 %x, i32 %y) ret i32 %1 @@ -98,12 +84,10 @@ define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind { define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_32_64: ; CHECK: # %bb.0: -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: subc 3, 4, 3 -; CHECK-NEXT: subfe 3, 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: neg 3, 3 -; CHECK-NEXT: isellt 3, 4, 3 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i32 @llvm.ucmp(i64 %x, i64 %y) ret i32 %1 @@ -112,12 +96,10 @@ define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind { define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind { ; CHECK-LABEL: ucmp_64_64: ; CHECK: # %bb.0: -; CHECK-NEXT: subc 5, 4, 3 -; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: subfe 5, 4, 4 -; CHECK-NEXT: neg 5, 5 -; CHECK-NEXT: isellt 3, 3, 5 +; CHECK-NEXT: subc 6, 4, 3 +; CHECK-NEXT: sub 5, 3, 4 +; CHECK-NEXT: subfe 3, 4, 3 +; CHECK-NEXT: subfe 3, 3, 5 ; CHECK-NEXT: blr %1 = call i64 @llvm.ucmp(i64 %x, i64 %y) ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index edec1d0..1957019 100644 --- 
a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -201,8 +201,9 @@ define signext i32 @test4(i32 signext %x, i32 signext %y, i32 signext %z) { ; ; RV32IXQCI-LABEL: test4: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: li a0, 0 -; RV32IXQCI-NEXT: qc.lieqi a0, a2, 0, 3 +; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: li a1, 3 +; RV32IXQCI-NEXT: qc.selectieqi a0, 0, a1, 0 ; RV32IXQCI-NEXT: ret %c = icmp eq i32 %z, 0 %a = select i1 %c, i32 3, i32 0 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 2ebb6e9..d089e36 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -514,6 +514,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a1, 16 ; RV32I-NEXT: addi s1, a1, -1 ; RV32I-NEXT: and a0, a0, s1 @@ -521,13 +522,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, a0, a1 ; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, 524288 -; RV32I-NEXT: xor a0, s0, a0 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, s2, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s0 @@ -536,6 +536,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; @@ -545,6 +546,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 
8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: lui a1, 16 ; RV64I-NEXT: addi s1, a1, -1 ; RV64I-NEXT: and a0, a0, s1 @@ -552,13 +554,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, a0, a1 ; RV64I-NEXT: and a0, a0, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lui a0, 524288 -; RV64I-NEXT: xor a0, s0, a0 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s1 +; RV64I-NEXT: and a0, s2, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s0 @@ -567,6 +568,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; @@ -638,11 +640,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 1048568 ; RV32I-NEXT: slli s1, s1, 17 ; RV32I-NEXT: and a0, a0, a1 @@ -677,11 +675,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: not a0, a0 ; RV64I-NEXT: lui a1, 1048568 ; RV64I-NEXT: slli s1, s1, 49 ; RV64I-NEXT: and a0, a0, a1 @@ -804,15 +798,14 @@ define half 
@fabs_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: slli s0, a0, 17 +; RV32I-NEXT: srli s0, s0, 17 ; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: slli a0, a0, 1 -; RV32I-NEXT: srli a0, a0, 1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -841,15 +834,14 @@ define half @fabs_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: slli s0, a0, 49 +; RV64I-NEXT: srli s0, s0, 49 ; RV64I-NEXT: and a0, a0, s2 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: slli a0, a0, 33 -; RV64I-NEXT: srli a0, a0, 33 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s2 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: mv a0, s0 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload @@ -1217,25 +1209,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and a0, a2, s3 +; RV32I-NEXT: addi s2, a0, -1 +; RV32I-NEXT: and a0, a2, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, 
s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a0, s1 @@ -1261,25 +1249,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a1 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 16 -; RV64I-NEXT: addi s3, a0, -1 -; RV64I-NEXT: and a0, a2, s3 +; RV64I-NEXT: addi s2, a0, -1 +; RV64I-NEXT: and a0, a2, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: mv a0, s1 @@ -1355,43 +1339,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s1, a2 -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: lui s3, 16 -; RV32I-NEXT: addi s3, s3, -1 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi s3, a1, -1 ; RV32I-NEXT: and a0, a0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: and a0, s0, s3 
; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s2, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui s4, 524288 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, s2, a1 +; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call fmaf ; RV32I-NEXT: call __truncsfhf2 @@ -1413,43 +1388,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s1, a2 -; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: lui s3, 16 -; RV64I-NEXT: addi s3, s3, -1 +; RV64I-NEXT: mv s0, a2 +; RV64I-NEXT: mv s1, a1 +; RV64I-NEXT: lui a1, 16 +; RV64I-NEXT: addi s3, a1, -1 ; RV64I-NEXT: and a0, a0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s2, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui s4, 524288 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, 
a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, s2, a1 +; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 -; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 -; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: mv a0, s1 ; RV64I-NEXT: mv a1, s0 ; RV64I-NEXT: call fmaf ; RV64I-NEXT: call __truncsfhf2 @@ -1535,44 +1501,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s1, a2 -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui s3, 16 -; RV32I-NEXT: addi s3, s3, -1 +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 16 +; RV32I-NEXT: addi s3, a0, -1 ; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s2, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui s4, 524288 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, s2, a1 +; RV32I-NEXT: xor s4, a0, a1 ; RV32I-NEXT: and a0, s1, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: xor a0, a0, s4 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: 
mv s0, a0 ; RV32I-NEXT: and a0, s2, s3 ; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: and a0, s4, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a2, a0 ; RV32I-NEXT: mv a0, s0 -; RV32I-NEXT: mv a1, s2 +; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: call fmaf ; RV32I-NEXT: call __truncsfhf2 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload @@ -1593,44 +1550,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s1, a2 -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lui s3, 16 -; RV64I-NEXT: addi s3, s3, -1 +; RV64I-NEXT: mv s0, a2 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 16 +; RV64I-NEXT: addi s3, a0, -1 ; RV64I-NEXT: and a0, a1, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s2, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui s4, 524288 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, s2, a1 +; RV64I-NEXT: xor s4, a0, a1 ; RV64I-NEXT: and a0, s1, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: xor a0, a0, s4 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 -; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 ; RV64I-NEXT: and a0, s2, s3 ; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: and a0, s4, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a2, a0 ; RV64I-NEXT: mv a0, s0 -; 
RV64I-NEXT: mv a1, s2 +; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: call fmaf ; RV64I-NEXT: call __truncsfhf2 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload @@ -1960,25 +1908,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s3, a1, -1 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s2, a1, -1 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, s1 ; RV32I-NEXT: mv a2, s0 @@ -2003,25 +1947,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a1 ; RV64I-NEXT: lui a1, 16 -; RV64I-NEXT: addi s3, a1, -1 -; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: addi s2, a1, -1 +; RV64I-NEXT: and a0, a0, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 ; RV64I-NEXT: call 
__extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, s1 ; RV64I-NEXT: mv a2, s0 @@ -2096,25 +2036,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv s0, a2 ; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s3, a0, -1 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: addi s2, a0, -1 +; RV32I-NEXT: and a0, a1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s1, s3 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s3, a0, a1 +; RV32I-NEXT: and a0, s1, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: and a0, s0, s3 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: and a0, s2, s3 +; RV32I-NEXT: and a0, s3, s2 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s1 @@ -2140,25 +2076,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv s0, a2 ; RV64I-NEXT: mv s1, a0 ; RV64I-NEXT: lui a0, 16 -; RV64I-NEXT: addi s3, a0, -1 -; RV64I-NEXT: and a0, a1, s3 +; RV64I-NEXT: addi s2, a0, -1 +; RV64I-NEXT: and a0, a1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: li a1, 0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s2, a0 -; RV64I-NEXT: and a0, s1, s3 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s3, a0, a1 +; RV64I-NEXT: and a0, s1, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s1, a0 -; RV64I-NEXT: and a0, s0, s3 +; RV64I-NEXT: and a0, s0, s2 
; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: and a0, s2, s3 +; RV64I-NEXT: and a0, s3, s2 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s1 @@ -2519,12 +2451,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __mulsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s1, a0, a1 ; RV32I-NEXT: and a0, s0, s3 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 @@ -2580,12 +2508,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind { ; RV64I-NEXT: mv a0, s2 ; RV64I-NEXT: call __mulsf3 ; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: call __extendhfsf2 -; RV64I-NEXT: lui a1, 524288 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s1, a0, a1 ; RV64I-NEXT: and a0, s0, s3 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 diff --git a/llvm/test/CodeGen/RISCV/xqcicli.ll b/llvm/test/CodeGen/RISCV/xqcicli.ll index 8d4caa1..cdb1947 100644 --- a/llvm/test/CodeGen/RISCV/xqcicli.ll +++ b/llvm/test/CodeGen/RISCV/xqcicli.ll @@ -23,7 +23,8 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eq: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieq a0, a1, a2, 11 +; RV32IXQCI-NEXT: qc.selectine a1, a2, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %b, %x @@ -47,7 +48,8 @@ define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ne: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.line a0, a1, a2, 11 +; RV32IXQCI-NEXT: qc.selectieq a1, a2, a0, 11 +; RV32IXQCI-NEXT: mv a0, 
a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %b, %x @@ -167,7 +169,8 @@ define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eq_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.line a0, a1, a2, 11 +; RV32IXQCI-NEXT: qc.selectieq a1, a2, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %b, %x @@ -191,7 +194,8 @@ define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ne_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieq a0, a1, a2, 11 +; RV32IXQCI-NEXT: qc.selectine a1, a2, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %b, %x @@ -312,7 +316,8 @@ define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %b, 12 @@ -337,7 +342,8 @@ define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %b, 12 @@ -462,7 +468,8 @@ define i32 @select_cc_example_eqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi_c1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 12, %b @@ -487,7 +494,8 @@ define i32 @select_cc_example_nei_c1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei_c1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectieqi 
a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 12, %b @@ -612,7 +620,8 @@ define i32 @select_cc_example_eqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi_c2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 12, %b @@ -637,7 +646,8 @@ define i32 @select_cc_example_nei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei_c2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 12, %b @@ -762,7 +772,8 @@ define i32 @select_cc_example_eqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi_c3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %b, 12 @@ -787,7 +798,8 @@ define i32 @select_cc_example_nei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei_c3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11 +; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %b, 12 diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll index c0839c9..7656a0c 100644 --- a/llvm/test/CodeGen/RISCV/xqcics.ll +++ b/llvm/test/CodeGen/RISCV/xqcics.ll @@ -270,8 +270,7 @@ define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.line a2, a0, a1, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 
%a, %b @@ -301,8 +300,7 @@ define i32 @select_cc_example_eqi_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eqi_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieq a2, a0, a1, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, %b @@ -332,8 +330,7 @@ define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieq a2, a0, a1, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, %b @@ -363,8 +360,7 @@ define i32 @select_cc_example_nei_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_nei_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.line a2, a0, a1, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, %b @@ -395,8 +391,7 @@ define i32 @select_cc_example_ieqi(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ieqi: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 12 @@ -427,8 +422,7 @@ define i32 @select_cc_example_ieqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ieqi_c1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 12, %a @@ -459,8 +453,7 @@ define i32 @select_cc_example_ieqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ieqi_c2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11 
; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 12 @@ -491,8 +484,7 @@ define i32 @select_cc_example_ieqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ieqi_c3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 12, %a @@ -523,8 +515,7 @@ define i32 @select_cc_example_inei(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_inei: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 12 @@ -555,8 +546,7 @@ define i32 @select_cc_example_inei_c1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_inei_c1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 12, %a @@ -587,8 +577,7 @@ define i32 @select_cc_example_inei_c2(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_inei_c2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 12 @@ -619,8 +608,7 @@ define i32 @select_cc_example_inei_c3(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_inei_c3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 12, %a @@ -712,8 +700,7 @@ define i32 @select_cc_example_eq1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_eq1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.line a2, a1, a0, 11 -; 
RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %b, %a @@ -743,8 +730,7 @@ define i32 @select_cc_example_ne1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCI-LABEL: select_cc_example_ne1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.lieq a2, a1, a0, 11 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %b, %a diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll index 8d52fe5..3c35a29 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll @@ -63,34 +63,30 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: mov r4, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] ; CHECK-MVE-NEXT: vmov q5, q1 ; CHECK-MVE-NEXT: vmov q4, q0 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] +; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h ; CHECK-MVE-NEXT: mov r5, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q5[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q5[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r6, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] +; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r1, r6 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: vmov.16 q6[0], r5 -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: vmov.16 q6[1], r0 +; CHECK-MVE-NEXT: 
bfc r0, #15, #17 +; CHECK-MVE-NEXT: bfc r5, #15, #17 +; CHECK-MVE-NEXT: vmov.16 q6[0], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[2] +; CHECK-MVE-NEXT: vmov.16 q6[1], r5 ; CHECK-MVE-NEXT: bl __aeabi_h2f ; CHECK-MVE-NEXT: mov r5, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q4[2] @@ -98,9 +94,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[2], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[3] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -110,9 +104,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[3], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[4] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -122,9 +114,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[4], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[5] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -134,9 +124,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[5], r0 ; 
CHECK-MVE-NEXT: vmov.u16 r0, q5[6] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -146,9 +134,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[6], r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[7] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -158,9 +144,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) { ; CHECK-MVE-NEXT: mov r1, r5 ; CHECK-MVE-NEXT: bl __aeabi_fsub ; CHECK-MVE-NEXT: bl __aeabi_f2h -; CHECK-MVE-NEXT: bl __aeabi_h2f -; CHECK-MVE-NEXT: bic r0, r0, #-2147483648 -; CHECK-MVE-NEXT: bl __aeabi_f2h +; CHECK-MVE-NEXT: bfc r0, #15, #17 ; CHECK-MVE-NEXT: vmov.16 q6[7], r0 ; CHECK-MVE-NEXT: vstrw.32 q6, [r4] ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13} diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll new file mode 100644 index 0000000..2aea9c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll @@ -0,0 +1,38 @@ +;; Test if a potential indirect call target function which has internal linkage and +;; address taken has its type ID emitted to callgraph section. +;; This test also makes sure that callback functions which meet the above constraint +;; are handled correctly. 
+ +; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s + +declare !type !0 void @_Z6doWorkPFviE(ptr) + +define i32 @_Z4testv() !type !1 { +entry: + call void @_Z6doWorkPFviE(ptr nonnull @_ZL10myCallbacki) + ret i32 0 +} + +; CHECK: _ZL10myCallbacki: +; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: +define internal void @_ZL10myCallbacki(i32 %value) !type !2 { +entry: + %sink = alloca i32, align 4 + store volatile i32 %value, ptr %sink, align 4 + %i1 = load volatile i32, ptr %sink, align 4 + ret void +} + +!0 = !{i64 0, !"_ZTSFvPFviEE.generalized"} +!1 = !{i64 0, !"_ZTSFivE.generalized"} +!2 = !{i64 0, !"_ZTSFviE.generalized"} + +; CHECK: .section .callgraph,"o",@progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0. +; CHECK-NEXT: .byte 1 +;; Function Entry PC +; CHECK-NEXT: .quad [[LABEL_FUNC]] +;; Function type ID +; CHECK-NEXT: .quad -5212364466660467813 diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index f0dbc31..1aabf66 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -15,16 +15,13 @@ declare !type !2 ptr @direct_baz(ptr) define ptr @ball() { entry: call void @direct_foo() - %fp_foo_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP0:\.L.*]]: + %fp_foo_val = load ptr, ptr null, align 8 call void (...) 
%fp_foo_val(), !callee_type !0 call void @direct_foo() - %fp_bar_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP1:\.L.*]]: + %fp_bar_val = load ptr, ptr null, align 8 %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 %call_fp_bar_direct = call i32 @direct_bar(i8 1) %fp_baz_val = load ptr, ptr null, align 8 - ; CHECK: [[LABEL_TMP2:\.L.*]]: %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 call void @direct_foo() %call_fp_baz_direct = call ptr @direct_baz(ptr null) @@ -32,29 +29,31 @@ entry: ret ptr %call_fp_baz } -; CHECK: .section .callgraph,"o",@progbits,.text - -; CHECK-NEXT: .quad 0 -; CHECK-NEXT: .quad [[LABEL_FUNC]] -; CHECK-NEXT: .quad 1 -; CHECK-NEXT: .quad 3 !0 = !{!1} !1 = !{i64 0, !"_ZTSFvE.generalized"} -;; Test for MD5 hash of _ZTSFvE.generalized and the generated temporary callsite label. -; CHECK-NEXT: .quad 4524972987496481828 -; CHECK-NEXT: .quad [[LABEL_TMP0]] !2 = !{!3} !3 = !{i64 0, !"_ZTSFicE.generalized"} -;; Test for MD5 hash of _ZTSFicE.generalized and the generated temporary callsite label. -; CHECK-NEXT: .quad 3498816979441845844 -; CHECK-NEXT: .quad [[LABEL_TMP1]] !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} -;; Test for MD5 hash of _ZTSFPvS_E.generalized and the generated temporary callsite label. -; CHECK-NEXT: .quad 8646233951371320954 -; CHECK-NEXT: .quad [[LABEL_TMP2]] -;; Test for number of direct calls and {callsite_label, callee} pairs. -; CHECK-NEXT: .quad 3 + +; CHECK: .section .callgraph,"o",@progbits,.text +;; Version +; CHECK-NEXT: .byte 0 +;; Flags +; CHECK-NEXT: .byte 7 +;; Function Entry PC +; CHECK-NEXT: .quad [[LABEL_FUNC]] +;; Function type ID -- set to 0 as no type metadata attached to function. +; CHECK-NEXT: .quad 0 +;; Number of unique direct callees. +; CHECK-NEXT: .byte 3 +;; Direct callees. ; CHECK-NEXT: .quad direct_foo ; CHECK-NEXT: .quad direct_bar ; CHECK-NEXT: .quad direct_baz +;; Number of unique indirect target type IDs. +; CHECK-NEXT: .byte 3 +;; Indirect type IDs. 
+; CHECK-NEXT: .quad 4524972987496481828 +; CHECK-NEXT: .quad 3498816979441845844 +; CHECK-NEXT: .quad 8646233951371320954 diff --git a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll index fa14a98..34dc5b8 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-tailcall.ll @@ -22,13 +22,14 @@ declare !type !2 i32 @foo(i8 signext) declare !type !2 i32 @bar(i8 signext) -;; Check that the numeric type id (md5 hash) for the below type ids are emitted -;; to the callgraph section. - -; CHECK: Hex dump of section '.callgraph': - !0 = !{i64 0, !"_ZTSFiPvcE.generalized"} !1 = !{!2} -; CHECK-DAG: 5486bc59 814b8e30 !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{i64 0, !"_ZTSFiiE.generalized"} + +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00000000 00008e19 0b7f3326 +; CHECK-NEXT: 0x00000010 e3000154 86bc5981 4b8e3000 05000000 +;; Verify that the type id 0x308e4b8159bc8654 is in section. +; CHECK-NEXT: 0x00000020 00000000 00a150b8 3e0cfe3c b2015486 +; CHECK-NEXT: 0x00000030 bc59814b 8e30 diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll index 66d009c..c144a24 100644 --- a/llvm/test/CodeGen/X86/call-graph-section.ll +++ b/llvm/test/CodeGen/X86/call-graph-section.ll @@ -22,15 +22,16 @@ entry: ;; Check that the numeric type id (md5 hash) for the below type ids are emitted ;; to the callgraph section. 
- -; CHECK: Hex dump of section '.callgraph': - -; CHECK-DAG: 2444f731 f5eecb3e !0 = !{i64 0, !"_ZTSFvE.generalized"} !1 = !{!0} -; CHECK-DAG: 5486bc59 814b8e30 !2 = !{i64 0, !"_ZTSFicE.generalized"} !3 = !{!2} -; CHECK-DAG: 7ade6814 f897fd77 !4 = !{!5} !5 = !{i64 0, !"_ZTSFPvS_E.generalized"} + +;; Make sure following type IDs are in call graph section +;; 0x5eecb3e2444f731f, 0x814b8e305486bc59, 0xf897fd777ade6814 +; CHECK: Hex dump of section '.callgraph': +; CHECK-NEXT: 0x00000000 00050000 00000000 00000000 00000000 +; CHECK-NEXT: 0x00000010 00000324 44f731f5 eecb3e54 86bc5981 +; CHECK-NEXT: 0x00000020 4b8e307a de6814f8 97fd77 diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll deleted file mode 100644 index 128f5ee..0000000 --- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll +++ /dev/null @@ -1,101 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -fast-isel -fast-isel-abort=3 -mtriple=x86_64-apple-darwin10 | FileCheck %s -; RUN: llc < %s -fast-isel -mtriple=i686-- -mattr=+sse2 | FileCheck --check-prefix=SSE2 %s - -define double @fneg_f64(double %x) nounwind { -; CHECK-LABEL: fneg_f64: -; CHECK: ## %bb.0: -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000 -; CHECK-NEXT: xorq %rax, %rcx -; CHECK-NEXT: movq %rcx, %xmm0 -; CHECK-NEXT: retq -; -; SSE2-LABEL: fneg_f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pushl %ebp -; SSE2-NEXT: movl %esp, %ebp -; SSE2-NEXT: andl $-8, %esp -; SSE2-NEXT: subl $8, %esp -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; SSE2-NEXT: movlps %xmm0, (%esp) -; SSE2-NEXT: fldl (%esp) -; SSE2-NEXT: movl %ebp, %esp -; SSE2-NEXT: popl %ebp -; SSE2-NEXT: retl - %y = fneg double %x - ret double %y -} - -define float @fneg_f32(float %x) nounwind { -; CHECK-LABEL: fneg_f32: -; CHECK: ## %bb.0: -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: xorl 
$2147483648, %eax ## imm = 0x80000000 -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retq -; -; SSE2-LABEL: fneg_f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pushl %eax -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; SSE2-NEXT: movss %xmm0, (%esp) -; SSE2-NEXT: flds (%esp) -; SSE2-NEXT: popl %eax -; SSE2-NEXT: retl - %y = fneg float %x - ret float %y -} - -define void @fneg_f64_mem(ptr %x, ptr %y) nounwind { -; CHECK-LABEL: fneg_f64_mem: -; CHECK: ## %bb.0: -; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000 -; CHECK-NEXT: xorq %rax, %rcx -; CHECK-NEXT: movq %rcx, %xmm0 -; CHECK-NEXT: movq %xmm0, (%rsi) -; CHECK-NEXT: retq -; -; SSE2-LABEL: fneg_f64_mem: -; SSE2: # %bb.0: -; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; SSE2-NEXT: movsd %xmm0, (%eax) -; SSE2-NEXT: retl - %a = load double, ptr %x - %b = fneg double %a - store double %b, ptr %y - ret void -} - -define void @fneg_f32_mem(ptr %x, ptr %y) nounwind { -; CHECK-LABEL: fneg_f32_mem: -; CHECK: ## %bb.0: -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: xorl $2147483648, %eax ## imm = 0x80000000 -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movd %xmm0, (%rsi) -; CHECK-NEXT: retq -; -; SSE2-LABEL: fneg_f32_mem: -; SSE2: # %bb.0: -; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: xorl $2147483648, %ecx # imm = 0x80000000 -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movd %xmm0, (%eax) -; SSE2-NEXT: retl - %a = load float, ptr %x - %b = fneg float %a - store float %b, ptr %y - ret void -} diff --git 
a/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll b/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll new file mode 100644 index 0000000..b6c17ce --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512,AVX512-VL +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 -mattr=-avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512-NOVL + +; +; fptosi -> sitofp +; + +define double @scvtf64_i32(double %a0) { +; SSE-LABEL: scvtf64_i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: scvtf64_i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 +; AVX-NEXT: retq + %ii = fptosi double %a0 to i32 + %ff = sitofp i32 %ii to double + ret double %ff +} + +define double @scvtf64_i64(double %a0) { +; SSE-LABEL: scvtf64_i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: scvtf64_i64: +; AVX: # %bb.0: +; AVX-NEXT: vcvttsd2si %xmm0, %rax +; AVX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 +; AVX-NEXT: retq + %ii = fptosi double %a0 to i64 + %ff = sitofp i64 %ii to double + ret double %ff +} + +define float @scvtf32_i32(float %a0) { +; SSE-LABEL: scvtf32_i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: scvtf32_i32: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 +; AVX-NEXT: retq + %ii = fptosi 
float %a0 to i32 + %ff = sitofp i32 %ii to float + ret float %ff +} + +define float @scvtf32_i64(float %a0) { +; SSE-LABEL: scvtf32_i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: scvtf32_i64: +; AVX: # %bb.0: +; AVX-NEXT: vcvttss2si %xmm0, %rax +; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; AVX-NEXT: retq + %ii = fptosi float %a0 to i64 + %ff = sitofp i64 %ii to float + ret float %ff +} + +; +; fptoui -> uitofp +; + +define double @ucvtf64_i32(double %a0) { +; SSE-LABEL: ucvtf64_i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movl %eax, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2sd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX2-LABEL: ucvtf64_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ucvtf64_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm0 +; AVX512-NEXT: retq + %ii = fptoui double %a0 to i32 + %ff = uitofp i32 %ii to double + ret double %ff +} + +define double @ucvtf64_i64(double %a0) { +; SSE-LABEL: ucvtf64_i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rdx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX2-LABEL: ucvtf64_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttsd2si %xmm0, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; 
AVX2-NEXT: sarq $63, %rcx +; AVX2-NEXT: vcvttsd2si %xmm0, %rdx +; AVX2-NEXT: andq %rcx, %rdx +; AVX2-NEXT: orq %rax, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ucvtf64_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 +; AVX512-NEXT: retq + %ii = fptoui double %a0 to i64 + %ff = uitofp i64 %ii to double + ret double %ff +} + +define float @ucvtf32_i32(float %a0) { +; SSE-LABEL: ucvtf32_i32: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movl %eax, %eax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX2-LABEL: ucvtf32_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ucvtf32_i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttss2usi %xmm0, %eax +; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 +; AVX512-NEXT: retq + %ii = fptoui float %a0 to i32 + %ff = uitofp i32 %ii to float + ret float %ff +} + +define float @ucvtf32_i64(float %a0) { +; SSE-LABEL: ucvtf32_i64: +; SSE: # %bb.0: +; SSE-NEXT: cvttss2si %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rdx +; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: andq %rdx, %rax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: js .LBB7_1 +; SSE-NEXT: # %bb.2: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: retq +; SSE-NEXT: .LBB7_1: +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: shrq %rcx +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ss %rax, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: 
retq +; +; AVX2-LABEL: ucvtf32_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vcvttss2si %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rdx +; AVX2-NEXT: sarq $63, %rdx +; AVX2-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vcvttss2si %xmm0, %rax +; AVX2-NEXT: andq %rdx, %rax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: js .LBB7_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ucvtf32_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvttss2usi %xmm0, %rax +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 +; AVX512-NEXT: retq + %ii = fptoui float %a0 to i64 + %ff = uitofp i64 %ii to float + ret float %ff +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512-NOVL: {{.*}} +; AVX512-VL: {{.*}} diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll new file mode 100644 index 0000000..a0c243b --- /dev/null +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -0,0 +1,43 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; A minimal test case. llc will crash if global variables already has a section +;; prefix. Subsequent PRs will expand on this test case to test the hotness +;; reconciliation implementation. 
+ +; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -data-sections=true -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR + +; ERR: Global variable hot_bss already has a section prefix hot + +@hot_bss = internal global i32 0, !section_prefix !17 + +define void @hot_func() !prof !14 { + %9 = load i32, ptr @hot_bss + %11 = call i32 (...) @func_taking_arbitrary_param(i32 %9) + ret void +} + +declare i32 @func_taking_arbitrary_param(...) + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 1460183} +!5 = !{!"MaxCount", i64 849024} +!6 = !{!"MaxInternalCount", i64 32769} +!7 = !{!"MaxFunctionCount", i64 849024} +!8 = !{!"NumCounts", i64 23627} +!9 = !{!"NumFunctions", i64 3271} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13} +!12 = !{i32 990000, i64 166, i32 73} +!13 = !{i32 999999, i64 3, i32 1443} +!14 = !{!"function_entry_count", i64 100000} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 1, i32 99999} +!17 = !{!"section_prefix", !"hot"} diff --git a/llvm/test/CodeGen/X86/isel-fneg.ll b/llvm/test/CodeGen/X86/isel-fneg.ll new file mode 100644 index 0000000..77b3f26 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-fneg.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86 +; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,FASTISEL-SSE-X86 +; RUN: 
llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,SDAG-SSE-X86 +; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,GISEL-SSE-X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,FASTISEL-SSE-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,SDAG-SSE-X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,GISEL-SSE-X64 + +define double @fneg_f64(double %x) nounwind { +; X86-LABEL: fneg_f64: +; X86: # %bb.0: +; X86-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NEXT: fchs +; X86-NEXT: retl +; +; FASTISEL-SSE-X64-LABEL: fneg_f64: +; FASTISEL-SSE-X64: # %bb.0: +; FASTISEL-SSE-X64-NEXT: movq %xmm0, %rax +; FASTISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; FASTISEL-SSE-X64-NEXT: xorq %rax, %rcx +; FASTISEL-SSE-X64-NEXT: movq %rcx, %xmm0 +; FASTISEL-SSE-X64-NEXT: retq +; +; SDAG-SSE-X64-LABEL: fneg_f64: +; SDAG-SSE-X64: # %bb.0: +; SDAG-SSE-X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SDAG-SSE-X64-NEXT: retq +; +; GISEL-SSE-X64-LABEL: fneg_f64: +; GISEL-SSE-X64: # %bb.0: +; GISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; GISEL-SSE-X64-NEXT: movq %xmm0, %rcx +; GISEL-SSE-X64-NEXT: xorq %rax, %rcx +; GISEL-SSE-X64-NEXT: movq %rcx, %xmm0 +; GISEL-SSE-X64-NEXT: retq + %y = fneg double %x + ret double %y +} + +define float @fneg_f32(float %x) nounwind { +; FASTISEL-X86-LABEL: fneg_f32: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: flds {{[0-9]+}}(%esp) +; FASTISEL-X86-NEXT: fchs +; FASTISEL-X86-NEXT: retl +; +; SDAG-X86-LABEL: fneg_f32: +; SDAG-X86: # %bb.0: +; SDAG-X86-NEXT: flds {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: fchs +; 
SDAG-X86-NEXT: retl +; +; SSE-X86-LABEL: fneg_f32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %eax +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; SSE-X86-NEXT: movss %xmm0, (%esp) +; SSE-X86-NEXT: flds (%esp) +; SSE-X86-NEXT: popl %eax +; SSE-X86-NEXT: retl +; +; FASTISEL-SSE-X64-LABEL: fneg_f32: +; FASTISEL-SSE-X64: # %bb.0: +; FASTISEL-SSE-X64-NEXT: movd %xmm0, %eax +; FASTISEL-SSE-X64-NEXT: xorl $2147483648, %eax # imm = 0x80000000 +; FASTISEL-SSE-X64-NEXT: movd %eax, %xmm0 +; FASTISEL-SSE-X64-NEXT: retq +; +; SDAG-SSE-X64-LABEL: fneg_f32: +; SDAG-SSE-X64: # %bb.0: +; SDAG-SSE-X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SDAG-SSE-X64-NEXT: retq +; +; GISEL-SSE-X64-LABEL: fneg_f32: +; GISEL-SSE-X64: # %bb.0: +; GISEL-SSE-X64-NEXT: movd %xmm0, %eax +; GISEL-SSE-X64-NEXT: addl $-2147483648, %eax # imm = 0x80000000 +; GISEL-SSE-X64-NEXT: movd %eax, %xmm0 +; GISEL-SSE-X64-NEXT: retq + %y = fneg float %x + ret float %y +} + +define void @fneg_f64_mem(ptr %x, ptr %y) nounwind { +; X86-LABEL: fneg_f64_mem: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: fldl (%ecx) +; X86-NEXT: fchs +; X86-NEXT: fstpl (%eax) +; X86-NEXT: retl +; +; FASTISEL-SSE-X64-LABEL: fneg_f64_mem: +; FASTISEL-SSE-X64: # %bb.0: +; FASTISEL-SSE-X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; FASTISEL-SSE-X64-NEXT: movq %xmm0, %rax +; FASTISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; FASTISEL-SSE-X64-NEXT: xorq %rax, %rcx +; FASTISEL-SSE-X64-NEXT: movq %rcx, %xmm0 +; FASTISEL-SSE-X64-NEXT: movq %xmm0, (%rsi) +; FASTISEL-SSE-X64-NEXT: retq +; +; SDAG-SSE-X64-LABEL: fneg_f64_mem: +; SDAG-SSE-X64: # %bb.0: +; SDAG-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; SDAG-SSE-X64-NEXT: xorq (%rdi), %rax +; SDAG-SSE-X64-NEXT: movq %rax, (%rsi) +; SDAG-SSE-X64-NEXT: retq +; +; GISEL-SSE-X64-LABEL: 
fneg_f64_mem: +; GISEL-SSE-X64: # %bb.0: +; GISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; GISEL-SSE-X64-NEXT: xorq (%rdi), %rax +; GISEL-SSE-X64-NEXT: movq %rax, (%rsi) +; GISEL-SSE-X64-NEXT: retq + %a = load double, ptr %x + %b = fneg double %a + store double %b, ptr %y + ret void +} + +define void @fneg_f32_mem(ptr %x, ptr %y) nounwind { +; FASTISEL-X86-LABEL: fneg_f32_mem: +; FASTISEL-X86: # %bb.0: +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; FASTISEL-X86-NEXT: xorl (%ecx), %edx +; FASTISEL-X86-NEXT: movl %edx, (%eax) +; FASTISEL-X86-NEXT: retl +; +; SDAG-X86-LABEL: fneg_f32_mem: +; SDAG-X86: # %bb.0: +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SDAG-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; SDAG-X86-NEXT: xorl (%ecx), %edx +; SDAG-X86-NEXT: movl %edx, (%eax) +; SDAG-X86-NEXT: retl +; +; FASTISEL-SSE-X86-LABEL: fneg_f32_mem: +; FASTISEL-SSE-X86: # %bb.0: +; FASTISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FASTISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FASTISEL-SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FASTISEL-SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; FASTISEL-SSE-X86-NEXT: movss %xmm0, (%eax) +; FASTISEL-SSE-X86-NEXT: retl +; +; SDAG-SSE-X86-LABEL: fneg_f32_mem: +; SDAG-SSE-X86: # %bb.0: +; SDAG-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SDAG-SSE-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; SDAG-SSE-X86-NEXT: xorl (%ecx), %edx +; SDAG-SSE-X86-NEXT: movl %edx, (%eax) +; SDAG-SSE-X86-NEXT: retl +; +; FASTISEL-SSE-X64-LABEL: fneg_f32_mem: +; FASTISEL-SSE-X64: # %bb.0: +; FASTISEL-SSE-X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FASTISEL-SSE-X64-NEXT: movd %xmm0, %eax +; FASTISEL-SSE-X64-NEXT: xorl $2147483648, %eax # imm = 
0x80000000 +; FASTISEL-SSE-X64-NEXT: movd %eax, %xmm0 +; FASTISEL-SSE-X64-NEXT: movd %xmm0, (%rsi) +; FASTISEL-SSE-X64-NEXT: retq +; +; SDAG-SSE-X64-LABEL: fneg_f32_mem: +; SDAG-SSE-X64: # %bb.0: +; SDAG-SSE-X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; SDAG-SSE-X64-NEXT: xorl (%rdi), %eax +; SDAG-SSE-X64-NEXT: movl %eax, (%rsi) +; SDAG-SSE-X64-NEXT: retq +; +; GISEL-SSE-X64-LABEL: fneg_f32_mem: +; GISEL-SSE-X64: # %bb.0: +; GISEL-SSE-X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; GISEL-SSE-X64-NEXT: xorl (%rdi), %eax +; GISEL-SSE-X64-NEXT: movl %eax, (%rsi) +; GISEL-SSE-X64-NEXT: retq + %a = load float, ptr %x + %b = fneg float %a + store float %b, ptr %y + ret void +} + +define x86_fp80 @test_fp80(x86_fp80 %a) nounwind { +; X86-LABEL: test_fp80: +; X86: # %bb.0: +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fchs +; X86-NEXT: retl +; +; X64-LABEL: test_fp80: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fchs +; X64-NEXT: retq + %1 = fneg x86_fp80 %a + ret x86_fp80 %1 +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; SSE-X64: {{.*}} diff --git a/llvm/test/CodeGen/X86/pr162812.ll b/llvm/test/CodeGen/X86/pr162812.ll new file mode 100644 index 0000000..4ea3101 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr162812.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE42 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define <32 x i8> @PR162812(<32 x i8> %a, <32 x i8> %mask) { +; SSE2-LABEL: PR162812: +; SSE2: # %bb.0: +; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: paddb %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE42-LABEL: PR162812: +; SSE42: # %bb.0: +; SSE42-NEXT: movdqa %xmm2, %xmm5 +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm6 +; SSE42-NEXT: psllw $2, %xmm6 +; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; SSE42-NEXT: pand %xmm7, %xmm6 +; SSE42-NEXT: psrlw $2, %xmm5 +; SSE42-NEXT: movdqa 
{{.*#+}} xmm4 = [8224,8224,8224,8224,8224,8224,8224,8224] +; SSE42-NEXT: pand %xmm4, %xmm5 +; SSE42-NEXT: paddb %xmm5, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm2, %xmm6 +; SSE42-NEXT: paddb %xmm2, %xmm6 +; SSE42-NEXT: paddb %xmm5, %xmm5 +; SSE42-NEXT: movdqa %xmm5, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm6, %xmm2 +; SSE42-NEXT: movdqa %xmm1, %xmm5 +; SSE42-NEXT: psllw $2, %xmm5 +; SSE42-NEXT: pand %xmm7, %xmm5 +; SSE42-NEXT: psrlw $2, %xmm3 +; SSE42-NEXT: pand %xmm3, %xmm4 +; SSE42-NEXT: paddb %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm1, %xmm3 +; SSE42-NEXT: paddb %xmm4, %xmm4 +; SSE42-NEXT: movdqa %xmm4, %xmm0 +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm0 +; SSE42-NEXT: retq +; +; AVX2-LABEL: PR162812: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR162812: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq + %1 = lshr <32 x i8> %mask, splat (i8 7) + %ret = shl <32 x i8> %a, 
%1 + ret <32 x i8> %ret +} diff --git a/llvm/test/CodeGen/X86/stack-protector-target.ll b/llvm/test/CodeGen/X86/stack-protector-target.ll index f7c5680..4ba0302 100644 --- a/llvm/test/CodeGen/X86/stack-protector-target.ll +++ b/llvm/test/CodeGen/X86/stack-protector-target.ll @@ -2,13 +2,8 @@ ; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=I386-TLS %s ; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=X64-TLS %s -; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=I386 %s -; RUN: llc -mtriple=i386-linux-android16 < %s -o - | FileCheck --check-prefix=I386 %s -; RUN: llc -mtriple=i386-linux-android17 < %s -o - | FileCheck --check-prefix=I386-TLS %s -; RUN: llc -mtriple=i386-linux-android24 < %s -o - | FileCheck --check-prefix=I386-TLS %s +; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=I386-TLS %s ; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=X64-TLS %s -; RUN: llc -mtriple=x86_64-linux-android17 < %s -o - | FileCheck --check-prefix=X64-TLS %s -; RUN: llc -mtriple=x86_64-linux-android24 < %s -o - | FileCheck --check-prefix=X64-TLS %s ; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=I386-TLS %s ; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=X64-TLS %s @@ -27,11 +22,6 @@ declare void @_Z7CapturePi(ptr) ; X64-TLS: movq %fs:40, %[[C:.*]] ; X64-TLS: cmpq 16(%rsp), %[[C]] -; I386: movl __stack_chk_guard, %[[B:.*]] -; I386: movl %[[B]], 8(%esp) -; I386: movl __stack_chk_guard, %[[C:.*]] -; I386: cmpl 8(%esp), %[[C]] - ; I386-TLS: movl %gs:20, %[[B:.*]] ; I386-TLS: movl %[[B]], 8(%esp) ; I386-TLS: movl %gs:20, %[[C:.*]] |